作者 刘锟

update

... ... @@ -13,7 +13,7 @@ use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Redis;
/**
* 4.0,5.0升级到6.0,页面采集
* 4.0,5.0升级到6.0,主站页面采集
* Class ProjectImport
* @package App\Console\Commands
* @author Akun
... ... @@ -137,7 +137,7 @@ class HtmlCollect extends Command
//设置数据库
$project = ProjectServer::useProject($update_log->project_id);
if ($project) {
$collect_list = CollectTask::select(['id', 'project_id'])->where('project_id', $update_log['project_id'])->where('source', $source)->where('status', CollectTask::STATUS_UN)->orderBy('language', 'asc')->limit(50)->get();
$collect_list = CollectTask::select(['id', 'project_id'])->where('project_id', $update_log['project_id'])->where('source', $source)->where('language', '')->where('status', CollectTask::STATUS_UN)->orderBy('language', 'asc')->limit(50)->get();
if ($collect_list->count() == 0) {
$complete = true;
... ... @@ -151,7 +151,7 @@ class HtmlCollect extends Command
DB::disconnect('custom_mysql');
if ($complete) {
$update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
$update_log->collect_status = UpdateLog::COLLECT_STATUS_MAIN;
return 0;
}
... ...
<?php
namespace App\Console\Commands\Update;
use App\Models\Collect\CollectSource;
use App\Models\Collect\CollectTask;
use App\Models\Com\UpdateLog;
use App\Models\RouteMap\RouteMap;
use App\Services\CosService;
use App\Services\ProjectServer;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Redis;
/**
 * Upgrade from 4.0/5.0 to 6.0: collect the minor-language (non-default language) pages.
 * Class HtmlLanguageCollect
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/11/20 14:04
 */
class HtmlLanguageCollect extends Command
{
/**
* The name and signature of the console command.
*
* @var string
*/
protected $signature = 'project_html_language_collect';
/**
* The console command description.
*
* @var string
*/
protected $description = '执行项目html页面采集';
/**
 * Execute the console command.
 *
 * Calls start_collect() in an endless loop; this method never returns on its own.
 */
public function handle()
{
    for (;;) {
        $this->start_collect();
    }
}
/**
 * Process one queued collection task: fetch the page HTML, localise the resources it
 * references and store the rewritten HTML on the task row.
 *
 * The project database connection is released on every exit path, including early
 * returns and exceptions raised while saving.
 *
 * @return bool Always true.
 */
protected function start_collect()
{
    $task_id = $this->get_task();
    if ($task_id === false) {
        // Nothing left to collect for any project.
        sleep(60);
        return true;
    } elseif ($task_id === 0) {
        // The current project has just been finished.
        sleep(2);
        return true;
    }

    // Queue entries have the form "{project_id}_{collect_id}".
    $task_arr = explode('_', $task_id);
    $project_id = $task_arr[0];
    $collect_id = $task_arr[1];

    // Switch to the project database.
    $project = ProjectServer::useProject($project_id);
    try {
        if ($project) {
            $collect_info = CollectTask::select(['id', 'domain', 'route'])
                ->where('id', $collect_id)
                ->where('status', CollectTask::STATUS_UN)
                ->first();
            if (!$collect_info) {
                // The task no longer exists or is no longer pending.
                sleep(2);
                return true;
            }

            echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect start' . PHP_EOL;

            $collect_info->status = CollectTask::STATUS_ING;
            $collect_info->save();

            // Fetch the page, download its resources and rewrite the references.
            try {
                $html = file_get_contents('https://' . $collect_info->domain . $collect_info->route);
                if ($html === false) {
                    // Without an error-to-exception handler a failed fetch only returns false;
                    // never store that as the page content.
                    throw new \RuntimeException('failed to fetch page');
                }

                $source_list = $this->html_preg($html, $project_id, $collect_info->domain);
                if ($source_list) {
                    $html = $this->upload_source($html, $source_list, $project_id);
                }
            } catch (\Exception $e) {
                // The task row keeps STATUS_ING here, so it is not picked up again.
                echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: ' . $e->getMessage() . PHP_EOL;
                return true;
            }

            $collect_info->html = $html;
            $collect_info->status = CollectTask::STATUS_COM;
            $collect_info->save();

            echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect end' . PHP_EOL;
        }
    } finally {
        // Release the project database connection.
        DB::disconnect('custom_mysql');
    }

    sleep(2);
    return true;
}
/**
 * Fetch the next collection task, refilling the Redis queue from the database when empty.
 *
 * @return string|int|false "{project_id}_{collect_id}" for a task; 0 when the selected
 *                          project has no pending minor-language pages left (its update log
 *                          is then marked as fully collected); false when there is no update
 *                          log to work on or no task could be popped.
 */
protected function get_task()
{
    $key = 'console_html_language_collect_task';

    $task_id = Redis::rpop($key);
    if ($task_id) {
        return $task_id;
    }

    // Queue is empty: pick the first imported project whose main-site collection is done.
    $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)
        ->where('collect_status', UpdateLog::COLLECT_STATUS_MAIN)
        ->orderBy('project_id', 'asc')
        ->first();
    if (!$update_log) {
        return false;
    }

    switch ($update_log->api_type) {
        case 'page':
            $source = RouteMap::SOURCE_PAGE;
            break;
        case 'news':
            $source = RouteMap::SOURCE_NEWS;
            break;
        case 'blog':
            $source = RouteMap::SOURCE_BLOG;
            break;
        case 'tag':
            $source = RouteMap::SOURCE_PRODUCT_KEYWORD;
            break;
        default:
            $source = RouteMap::SOURCE_PRODUCT;
            break;
    }

    $complete = false;

    // Switch to the project database.
    $project = ProjectServer::useProject($update_log->project_id);
    if ($project) {
        // Only rows with a non-empty language, i.e. the minor-language pages.
        $collect_list = CollectTask::select(['id', 'project_id'])
            ->where('project_id', $update_log->project_id)
            ->where('source', $source)
            ->where('language', '!=', '')
            ->where('status', CollectTask::STATUS_UN)
            ->orderBy('language', 'asc')
            ->limit(50)
            ->get();

        if ($collect_list->count() == 0) {
            $complete = true;
        } else {
            foreach ($collect_list as $collect) {
                Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
            }
        }
    }

    // Release the project database connection.
    DB::disconnect('custom_mysql');

    if ($complete) {
        $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
        // Persist the new status; without this the same update log is selected again
        // on every call and no other project is ever reached.
        $update_log->save();
        return 0;
    }

    $task_id = Redis::rpop($key);

    // NOTE(review): the empty-list result of rpop depends on the Redis client (false or
    // null); normalise it so the caller's strict `=== false` check always applies.
    return $task_id ?: false;
}
/**
 * Extract the resource URLs referenced by an HTML document and classify each of them
 * through url_check().
 *
 * Scans, in this order: <img src>, <script src>, <source src>, <link href> and CSS
 * url(...) references. Each distinct URL is checked once; repeated occurrences are
 * skipped so a resource is not looked up, downloaded and replaced more than once.
 *
 * @param string $html       Page HTML.
 * @param mixed  $project_id Project the page belongs to.
 * @param string $domain     Domain used by url_check() to complete host-less URLs.
 * @return array List of the non-false results returned by url_check().
 */
protected function html_preg($html, $project_id, $domain)
{
    $source = [];
    if (!$html) {
        return $source;
    }

    // Each entry: [pattern, index of the capture group holding the URL].
    $patterns = [
        // image
        ['/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // js
        ['/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // video
        ['/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // css
        ['/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // css background
        ["/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", 1],
    ];

    $seen = [];
    foreach ($patterns as list($pattern, $group)) {
        preg_match_all($pattern, $html, $matches);

        foreach ($matches[$group] ?? [] as $url) {
            if (isset($seen[$url])) {
                continue;
            }
            $seen[$url] = true;

            $checked = $this->url_check($url, $project_id, $domain);
            if ($checked) {
                $source[] = $checked;
            }
        }
    }

    return $source;
}
/**
 * Decide whether a resource URL has to be downloaded.
 *
 * @param string $url        URL exactly as it appears in the page.
 * @param mixed  $project_id Project used to look up already collected resources.
 * @param string $domain     Domain used when the URL carries no host.
 * @return array|false false when the URL is skipped (empty, non-http(s) scheme, host
 *                     containing ".globalso." or ".goodao.", or a path without a dot);
 *                     otherwise an array with keys:
 *                     'download'     => bool, true when no CollectSource row exists yet,
 *                     'url'          => the original URL,
 *                     'url_complete' => absolute URL to download, or the stored target
 *                                       when the resource was collected before.
 */
protected function url_check($url, $project_id, $domain)
{
    if (!$url) {
        return false;
    }

    $arr = parse_url($url);
    if ($arr === false) {
        // Seriously malformed URL.
        return false;
    }

    $scheme = $arr['scheme'] ?? '';
    $host = $arr['host'] ?? '';
    $path = $arr['path'] ?? '';

    // Only http(s) resources can be fetched; this skips data:, javascript:, blob: etc.,
    // whose "path" may well contain a dot.
    if ($scheme && !in_array(strtolower($scheme), ['http', 'https'], true)) {
        return false;
    }

    // Resources on these hosts are left untouched.
    if (strpos($host, '.globalso.') !== false || strpos($host, '.goodao.') !== false) {
        return false;
    }

    // Require a path that looks like a file (contains a dot).
    if (!$path || strpos($path, '.') === false) {
        return false;
    }

    $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
    if ($source) {
        return [
            'download' => false,
            'url' => $url,
            'url_complete' => $source['target']
        ];
    }

    // A host-less path such as "images/a.png" must not be glued directly onto the domain.
    // The page directory is not known here, so it is resolved against the site root.
    if (!$host && $path[0] !== '/') {
        $path = '/' . $path;
    }

    return [
        'download' => true,
        'url' => $url,
        'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path
    ];
}
/**
 * Download the resources that still need collecting and rewrite their references in the HTML.
 *
 * @param string $html       Page HTML.
 * @param array  $source     Entries as produced by url_check()
 *                           ('download', 'url', 'url_complete').
 * @param mixed  $project_id Project the resources belong to.
 * @return string HTML with the resource URLs replaced.
 */
protected function upload_source($html, $source, $project_id)
{
    foreach ($source as $vs) {
        if (!$vs['download']) {
            // Already collected: 'url_complete' holds the stored target.
            $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
            continue;
        }

        $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
        if (!$new_source) {
            continue;
        }

        $now = date('Y-m-d H:i:s');
        CollectSource::insert([
            'project_id' => $project_id,
            'origin' => $vs['url'],
            'target' => $new_source,
            'created_at' => $now,
            'updated_at' => $now,
        ]);

        $html = str_replace($vs['url'], getImageUrl($new_source), $html);

        if (substr($new_source, -3, 3) === 'css') {
            // Also collect the resources referenced inside the css file.
            $this->upload_css_source($vs['url_complete'], $new_source, $project_id);
        }
    }

    return $html;
}

/**
 * Download the relative resources referenced by url('...') inside a collected css file.
 *
 * @param string $css_url    Absolute URL the css file was downloaded from.
 * @param string $css_target Stored location of the css file.
 * @param mixed  $project_id Project the resources belong to.
 * @return void
 */
protected function upload_css_source($css_url, $css_target, $project_id)
{
    $css_html = file_get_contents($css_url);
    if (!$css_html) {
        return;
    }

    preg_match_all("/url\(['\"](\s*[^>]+?)['\"]\)/i", $css_html, $result_css_source);
    $css_source = $result_css_source[1] ?? [];

    $url_arr = explode('/', $css_url);
    $target_arr = explode('/', $css_target);

    foreach ($css_source as $vcs) {
        $vcs_arr = parse_url($vcs);

        // Not a relative path (absolute URL, protocol-relative URL, data: URI): skip.
        // parse_url() reports the host under 'host'; it has no 'domain' component.
        if (isset($vcs_arr['scheme']) || isset($vcs_arr['host'])) {
            continue;
        }

        $vcs = $vcs_arr['path'] ?? '';
        if (!$vcs) {
            continue;
        }

        $source_info = CollectSource::where('project_id', $project_id)->where('origin', $vcs)->first();
        if ($source_info) {
            // Already collected.
            continue;
        }

        // Replace the css file name with the relative path, for both the remote URL
        // and the storage path.
        $url_arr[count($url_arr) - 1] = $vcs;
        $url_css_complete = implode('/', $url_arr);
        $target_arr[count($target_arr) - 1] = $vcs;
        $path = implode('/', $target_arr);

        $new_source_css = CosService::uploadRemote($project_id, 'source', $url_css_complete, $path);
        if ($new_source_css) {
            $now = date('Y-m-d H:i:s');
            CollectSource::insert([
                'project_id' => $project_id,
                'origin' => $vcs,
                'target' => $new_source_css,
                'created_at' => $now,
                'updated_at' => $now,
            ]);
        }
    }
}
}
... ...
... ... @@ -14,7 +14,8 @@ class UpdateLog extends Model
const STATUS_COM = 2;//导入完成
const COLLECT_STATUS_UN = 0;//未开始
const COLLECT_STATUS_COM = 1;//采集完成
const COLLECT_STATUS_COM = 1;//全站小语种采集完成
const COLLECT_STATUS_MAIN = 2;//英语主站采集完成
/**
* 创建更新日志
... ...