|
|
|
<?php
|
|
|
|
|
|
|
|
namespace App\Console\Commands\Update;
|
|
|
|
|
|
|
|
use App\Models\Collect\CollectSource;
|
|
|
|
use App\Models\Collect\CollectTask;
|
|
|
|
use App\Models\Com\UpdateLog;
|
|
|
|
use App\Services\CosService;
|
|
|
|
use App\Services\ProjectServer;
|
|
|
|
use Illuminate\Console\Command;
|
|
|
|
use Illuminate\Support\Facades\DB;
|
|
|
|
use Illuminate\Support\Facades\Redis;
|
|
|
|
|
|
|
|
/**
 * HTML page collection used when upgrading projects from 4.0/5.0 to 6.0.
 *
 * Class HtmlCollect
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/11/10 16:04
 */
class HtmlCollect extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'project_html_collect';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '执行项目html页面采集';

    /**
     * Command entry point: runs one collection pass.
     *
     * NOTE(review): an endless `while (true)` loop around this call was disabled in the
     * original code, so the command currently performs a single pass and exits.
     */
    public function handle()
    {
        $this->start_update();
    }

    /**
     * Runs one collection pass for a single "<project_id>_<collect_id>" task.
     *
     * The per-project connection `custom_mysql` is always disconnected before returning,
     * including when the task row is missing or the page download fails.
     *
     * @return bool always true
     */
    protected function start_update()
    {
        // NOTE(review): the task id is pinned to a fixed value and the queue lookup
        // ($this->get_task()) is disabled. This looks like a debugging leftover; the
        // two guards below only become reachable once get_task() is used again.
        $task_id = '298_1';

        if ($task_id === false) {
            // All projects have been collected.
            sleep(60);
            return true;
        }
        if ($task_id === 0) {
            // The current project has been collected.
            sleep(2);
            return true;
        }

        // A task id that does not have both parts cannot be processed.
        $task_arr = explode('_', (string)$task_id, 2);
        if (count($task_arr) < 2) {
            return true;
        }
        $project_id = $task_arr[0];
        $collect_id = $task_arr[1];

        try {
            // Switch to the project's database.
            if (ProjectServer::useProject($project_id)) {
                $this->collect_page($project_id, $collect_id);
            }
        } finally {
            // Close the project's database on every exit path.
            DB::disconnect('custom_mysql');
        }

        sleep(2);

        return true;
    }

    /**
     * Downloads one page, localises its resources and stores the resulting HTML on the task row.
     *
     * Does nothing when the task row does not exist or is not in CollectTask::STATUS_UN.
     * On a download/processing error the error is printed and the row is left in
     * CollectTask::STATUS_ING (same as the original behaviour).
     *
     * @param string $project_id
     * @param string $collect_id primary key of the CollectTask row
     * @return void
     */
    private function collect_page($project_id, $collect_id)
    {
        $collect_info = CollectTask::select(['id', 'domain', 'route'])
            ->where('id', $collect_id)
            ->where('status', CollectTask::STATUS_UN)
            ->first();
        if (!$collect_info) {
            return;
        }

        $this->write_log($project_id, $collect_id, 'collect start');

        $collect_info->status = CollectTask::STATUS_ING;
        $collect_info->save();

        // Fetch the HTML page, upload its resources and rewrite the references.
        try {
            $html = file_get_contents('https://' . $collect_info->domain . $collect_info->route);
            if ($html === false) {
                // file_get_contents() signals failure with false when warnings are not
                // converted to exceptions; never store that as page content.
                throw new \RuntimeException('failed to fetch page');
            }

            $source_list = $this->html_preg($html, $project_id, $collect_info->domain);
            if ($source_list) {
                $html = $this->upload_source($html, $source_list, $project_id);
            }
        } catch (\Exception $e) {
            $this->write_log($project_id, $collect_id, 'error: ' . $e->getMessage());
            return;
        }

        $collect_info->html = $html;
        $collect_info->status = CollectTask::STATUS_COM;
        $collect_info->save();

        $this->write_log($project_id, $collect_id, 'collect end');
    }

    /**
     * Prints one timestamped progress line for a task.
     *
     * @param string $project_id
     * @param string $collect_id
     * @param string $message
     * @return void
     */
    private function write_log($project_id, $collect_id, $message)
    {
        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', ' . $message . PHP_EOL;
    }

    /**
     * Fetches the next task id from the Redis list, refilling the list when it is empty.
     *
     * When the list is empty, the first UpdateLog row (ordered by project_id) with
     * status STATUS_COM and collect_status COLLECT_STATUS_UN is looked up and up to 50 of
     * that project's CollectTask rows in STATUS_UN are pushed onto the list.
     *
     * @return string|int|false "<project_id>_<collect_id>" for a task, 0 when the current
     *                          project has no tasks left, false when nothing is available
     */
    protected function get_task()
    {
        $key = 'console_html_collect_task';

        $task_id = Redis::rpop($key);
        if ($task_id) {
            return $task_id;
        }

        $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)
            ->where('collect_status', UpdateLog::COLLECT_STATUS_UN)
            ->orderBy('project_id', 'asc')
            ->first();
        if (!$update_log) {
            return false;
        }

        $complete = false;
        try {
            // Switch to the project's database.
            if (ProjectServer::useProject($update_log->project_id)) {
                $collect_list = CollectTask::select(['id', 'project_id'])
                    ->where('project_id', $update_log->project_id)
                    ->where('status', CollectTask::STATUS_UN)
                    ->limit(50)
                    ->get();

                if ($collect_list->count() === 0) {
                    $complete = true;
                } else {
                    foreach ($collect_list as $collect) {
                        Redis::lpush($key, $collect->project_id . '_' . $collect->id);
                    }
                }
            }
        } finally {
            // Close the project's database.
            DB::disconnect('custom_mysql');
        }

        if ($complete) {
            $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
            // Persist the flag; without this the same project is selected again on every call.
            $update_log->save();

            return 0;
        }

        // Normalise an empty pop (null/false depending on the Redis client) to false so
        // callers comparing with `=== false` see it.
        $task_id = Redis::rpop($key);

        return $task_id ?: false;
    }

    /**
     * Extracts resource URLs from the HTML with regular expressions.
     *
     * Tags are scanned in the order img[src], script[src], source[src], link[href]. Each
     * distinct URL is passed to url_check() once; URLs it rejects are dropped.
     *
     * @param string $html
     * @param string $project_id
     * @param string $domain host used to complete URLs that carry no host
     * @return array list of url_check() results (keys: download, url, url_complete)
     */
    protected function html_preg($html, $project_id, $domain)
    {
        $source = [];

        if (!$html) {
            return $source;
        }

        // [tag name, attribute holding the URL]: images, js, video sources, css.
        $targets = [
            ['img', 'src'],
            ['script', 'src'],
            ['source', 'src'],
            ['link', 'href'],
        ];

        $seen = [];
        foreach ($targets as $target) {
            $pattern = sprintf('/<%s\s+[^>]*?%s\s*=\s*(\'|")(.*?)\1[^>]*?\/?\s*>/i', $target[0], $target[1]);
            if (!preg_match_all($pattern, $html, $matches)) {
                continue;
            }

            foreach ($matches[2] as $url) {
                // A URL used several times in the page only needs one lookup/upload,
                // because the replacement later rewrites every occurrence at once.
                if (isset($seen[$url])) {
                    continue;
                }
                $seen[$url] = true;

                $checked = $this->url_check($url, $project_id, $domain);
                if ($checked) {
                    $source[] = $checked;
                }
            }
        }

        return $source;
    }

    /**
     * Decides whether a resource URL has to be downloaded.
     *
     * A URL is rejected (false) when it is empty or unparsable, uses a scheme other than
     * http/https, has a host containing ".globalso." or ".goodao.", or has a path without
     * a "." in it.
     *
     * @param string $url URL exactly as it appears in the HTML
     * @param string $project_id
     * @param string $domain host used when the URL carries no host
     * @return array|false ['download' => bool, 'url' => original URL, 'url_complete' =>
     *                     absolute URL to fetch, or the stored target when a CollectSource
     *                     row already exists for this project and URL]
     */
    protected function url_check($url, $project_id, $domain)
    {
        if (!$url) {
            return false;
        }

        $arr = parse_url($url);
        if ($arr === false) {
            return false;
        }
        $scheme = $arr['scheme'] ?? '';
        $host = $arr['host'] ?? '';
        $path = $arr['path'] ?? '';

        // data:, mailto:, javascript: and similar are not downloadable resources.
        if ($scheme !== '' && !in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        if (strpos($host, '.globalso.') !== false || strpos($host, '.goodao.') !== false) {
            return false;
        }

        if (!$path || strpos($path, '.') === false) {
            return false;
        }

        $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
        if ($source) {
            return [
                'download' => false,
                'url' => $url,
                'url_complete' => $source['target'],
            ];
        }

        // A relative path such as "img/a.png" would otherwise be glued onto the host
        // ("https://example.comimg/a.png"); resolve it against the site root instead.
        if ($host === '' && $path[0] !== '/') {
            $path = '/' . $path;
        }

        return [
            'download' => true,
            'url' => $url,
            'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path,
        ];
    }

    /**
     * Uploads the resources that need downloading and rewrites their references in the HTML.
     *
     * For an entry with download === true the resource is uploaded through
     * CosService::uploadRemote(); on a truthy result a CollectSource row is inserted and the
     * original URL is replaced by getImageUrl() of the new location. Entries with
     * download === false are replaced by getImageUrl() of their url_complete value. A failed
     * upload leaves the reference untouched.
     *
     * @param string $html
     * @param array $source list produced by html_preg()
     * @param string $project_id
     * @return string HTML with the replaced references
     */
    protected function upload_source($html, $source, $project_id)
    {
        foreach ($source as $vs) {
            if (!$vs['download']) {
                $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
                continue;
            }

            $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
            if (!$new_source) {
                continue;
            }

            $now = date('Y-m-d H:i:s');
            CollectSource::insert([
                'project_id' => $project_id,
                'origin' => $vs['url'],
                'target' => $new_source,
                'created_at' => $now,
                'updated_at' => $now,
            ]);

            $html = str_replace($vs['url'], getImageUrl($new_source), $html);
        }

        return $html;
    }
}
...
|
...
|
|