作者 刘锟

update

... ... @@ -136,7 +136,7 @@ class HtmlCollect extends Command
}
$update_log = UpdateLog::where('project_id', '<', 799)->where('status', UpdateLog::STATUS_COM)->where('collect_status', UpdateLog::COLLECT_STATUS_UN)->orderBy('project_id', 'asc')->first();
$update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)->where('collect_status', UpdateLog::COLLECT_STATUS_UN)->orderBy('project_id', 'asc')->first();
if (!$update_log) {
return false;
}
... ...
<?php
namespace App\Console\Commands\Update;
use App\Models\Collect\CollectSource;
use App\Models\Collect\CollectTask;
use App\Models\Com\UpdateLog;
use App\Models\Com\UpdateOldInfo;
use App\Models\RouteMap\RouteMap;
use App\Services\CosService;
use App\Services\ProjectServer;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Redis;
/**
 * Upgrade from 4.0/5.0 to 6.0: collect the main-site HTML pages.
 * Class HtmlCollectNew
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/11/10 16:04
 */
class HtmlCollectNew extends Command
{
/**
* The name and signature of the console command.
*
* @var string
*/
protected $signature = 'project_html_collect_new';
/**
* The console command description.
*
* @var string
*/
protected $description = '执行项目html页面采集';
/**
 * Command entry point.
 *
 * Raises the PHP memory limit to 512M and then runs collection rounds
 * back to back; the loop has no exit condition, so the command only stops
 * when the process is terminated or an uncaught error escapes a round.
 */
public function handle()
{
    ini_set('memory_limit', '512M');

    for (;;) {
        $this->start_collect();
    }
}
/**
 * Run one collection round: take a task id from get_task(), collect the page
 * it points to, then pause before the next round.
 *
 * Task ids have the form "<project_id>_<collect_id>". get_task() may also
 * return false (no update log is waiting; pause 60s), 0 (current project
 * exhausted; pause 2s) or an empty value when the queue was drained in the
 * meantime; the latter two are treated the same way.
 *
 * The 'custom_mysql' connection is disconnected on every path once
 * ProjectServer::useProject() has been called, including the early exits and
 * any exception that escapes the collection step.
 *
 * @return bool Always true.
 */
protected function start_collect()
{
    $task_id = $this->get_task();
    if ($task_id === false) {
        // No update log is waiting for collection.
        sleep(60);
        return true;
    }

    // Anything that does not split into "<project_id>_<collect_id>" is not a
    // usable task (0, an empty queue result, or a malformed entry).
    $task_arr = $task_id ? explode('_', (string)$task_id) : [];
    if (count($task_arr) < 2) {
        sleep(2);
        return true;
    }
    $project_id = $task_arr[0];
    $collect_id = $task_arr[1];

    // Point the project-specific database connection at this project.
    $project = ProjectServer::useProject($project_id);
    try {
        if ($project) {
            $this->collect_page($project_id, $collect_id);
        }
    } finally {
        // Always release the project connection, whatever happened above.
        DB::disconnect('custom_mysql');
    }

    sleep(2);
    return true;
}

/**
 * Fetch one pending collect task's page, mirror its static resources and
 * store the rewritten HTML on the task row.
 *
 * The task moves STATUS_UN -> STATUS_ING -> STATUS_COM, or to STATUS_FAIL
 * when the page could not be fetched or an exception was thrown while
 * processing it. Progress and errors are echoed to stdout.
 *
 * @param string $project_id Project part of the task id.
 * @param string $collect_id CollectTask id part of the task id.
 * @return void
 */
protected function collect_page($project_id, $collect_id)
{
    $collect_info = CollectTask::select(['id', 'domain', 'route'])->where('id', $collect_id)->where('status', CollectTask::STATUS_UN)->where('language', '')->first();
    if (!$collect_info) {
        // Task is gone or no longer in the "not started" state.
        return;
    }

    echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect start' . PHP_EOL;
    $collect_info->status = CollectTask::STATUS_ING;
    $collect_info->save();

    // Original domain information of the site being migrated.
    $old_info = UpdateOldInfo::getOldDomain($project_id, $collect_info->domain);

    try {
        $html = curl_c('https://' . $collect_info->domain . $collect_info->route, false);
        // NOTE(review): '0' appears to be curl_c()'s failure marker - confirm against the helper.
        if ($html == '0') {
            $collect_info->status = CollectTask::STATUS_FAIL;
            $collect_info->save();
            echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: no html' . PHP_EOL;
            return;
        }

        // Blank out inline base64 payloads before scanning for resources so
        // they cannot be mistaken for URLs. Only the base64 alphabet is
        // consumed; the previous pattern ran greedily to the last double
        // quote on the line and could hide every later asset on minified pages.
        $new_html = preg_replace('/(data:[^;]*;base64,)[A-Za-z0-9+\/=]+/', '$1', $html);
        if ($new_html === null) {
            // PCRE failure: fall back to scanning the untouched markup.
            $new_html = $html;
        }

        $source_list = $this->html_preg($new_html, $project_id, $collect_info->domain, $old_info['web_url_domain'], $old_info['home_url']);
        if ($source_list) {
            $html = $this->upload_source($html, $source_list, $project_id, $collect_info->domain, $old_info['web_url_domain'], $old_info['home_url']);
        }
    } catch (\Exception $e) {
        $collect_info->status = CollectTask::STATUS_FAIL;
        $collect_info->save();
        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: ' . $e->getMessage() . PHP_EOL;
        return;
    }

    $collect_info->html = $html;
    $collect_info->status = CollectTask::STATUS_COM;
    $collect_info->save();
    echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect end' . PHP_EOL;
}
//获取任务
/**
 * Get the next collect task id, refilling the Redis queue when it is empty.
 *
 * Queue entries have the form "<project_id>_<collect_id>". When the queue is
 * empty, the first update log with project_id >= 799 that is complete but
 * not yet collected is looked up, and up to 50 of its pending collect tasks
 * (empty language, source derived from the log's api_type) are pushed.
 *
 * @return string|int|false A task id; 0 when there is nothing to hand out
 *                          for the current project right now (it was just
 *                          marked as collected, or the queue came back
 *                          empty); false when no update log is waiting.
 */
protected function get_task()
{
    $key = 'console_html_collect_new_task';

    $task_id = Redis::rpop($key);
    if ($task_id) {
        return $task_id;
    }

    $update_log = UpdateLog::where('project_id', '>=', 799)->where('status', UpdateLog::STATUS_COM)->where('collect_status', UpdateLog::COLLECT_STATUS_UN)->orderBy('project_id', 'asc')->first();
    if (!$update_log) {
        return false;
    }

    // Map the log's api_type onto a route source; anything unknown is a product.
    $source_map = [
        'page' => RouteMap::SOURCE_PAGE,
        'news' => RouteMap::SOURCE_NEWS,
        'blog' => RouteMap::SOURCE_BLOG,
    ];
    $source = $source_map[$update_log->api_type] ?? RouteMap::SOURCE_PRODUCT;

    $complete = false;
    // Point the project-specific database connection at this project.
    $project = ProjectServer::useProject($update_log->project_id);
    try {
        if ($project) {
            $collect_list = CollectTask::select(['id', 'project_id'])->where('project_id', $update_log->project_id)->where('source', $source)->where('language', '')->where('status', CollectTask::STATUS_UN)->orderBy('id', 'asc')->limit(50)->get();
            if ($collect_list->count() == 0) {
                $complete = true;
            } else {
                foreach ($collect_list as $collect) {
                    Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
                }
            }
        }
    } finally {
        // Release the project connection even if the query above throws.
        DB::disconnect('custom_mysql');
    }

    if ($complete) {
        // No pending tasks left for this project: flag its main-site collection as done.
        $update_log->collect_status = UpdateLog::COLLECT_STATUS_MAIN;
        $update_log->save();
        return 0;
    }

    // The queue can be empty here (project unavailable, or the entries were
    // already consumed). Report that as 0 so the caller never receives an
    // empty value it would try to split into project and collect ids.
    $task_id = Redis::rpop($key);
    return $task_id ?: 0;
}
//正则匹配html资源
/**
 * Scan an HTML document for resource references and return the ones that
 * url_check() accepts.
 *
 * The document is scanned, in this order, for: <img src>, <script src>,
 * <source src>, <link href>, CSS url(...) values and <a href>. Every
 * captured URL is passed to url_check(); truthy results are appended to the
 * returned list in the order they were found (duplicates are not removed).
 *
 * @param string $html            Markup to scan; an empty value yields [].
 * @param mixed  $project_id      Forwarded to url_check().
 * @param string $domain          Forwarded to url_check().
 * @param string $web_url_domain  Forwarded to url_check().
 * @param string $home_url        Forwarded to url_check().
 * @return array List of url_check() result arrays.
 */
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
{
    if (!$html) {
        return [];
    }

    // Each rule is [pattern, index of the capture group holding the URL].
    $rules = [
        // image
        ['/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // js
        ['/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // video
        ['/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // css
        ['/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // css background
        ["/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", 1],
        // downloadable files linked from <a> tags
        ['/<a\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
    ];

    $found = [];
    foreach ($rules as $rule) {
        $pattern = $rule[0];
        $group = $rule[1];

        preg_match_all($pattern, $html, $matches);
        $candidates = $matches[$group] ?? [];

        foreach ($candidates as $candidate) {
            $checked = $this->url_check($candidate, $project_id, $domain, $web_url_domain, $home_url);
            if ($checked) {
                $found[] = $checked;
            }
        }
    }

    return $found;
}
//判断资源是否需要下载
/**
 * Decide whether a URL found in a page is a site-local static resource and,
 * if so, whether it still has to be downloaded.
 *
 * A URL qualifies when all of the following hold (after stripping "&quot;"):
 *  - its scheme is empty, "http" or "https";
 *  - its host is empty or occurs inside $web_url_domain or $home_url;
 *  - its path is non-empty, starts with "/" and contains a ".";
 *  - the text after the last "." of the path contains none of
 *    "html", "php", "com", "xml".
 *
 * @param string $url             Raw URL as captured from the markup.
 * @param mixed  $project_id      Project whose CollectSource rows are consulted.
 * @param string $domain          Host used to complete host-less URLs.
 * @param string $web_url_domain  String the URL's host is searched in.
 * @param string $home_url        Second string the URL's host is searched in.
 * @return array|false false when the URL does not qualify; otherwise an array
 *                     with 'download' (bool), 'url' (the cleaned input) and
 *                     'url_complete' (absolute URL to fetch when 'download'
 *                     is true, or the stored target when a CollectSource row
 *                     already exists for this origin).
 */
protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
{
    if (!$url) {
        return false;
    }

    $url = str_replace('&quot;', '', $url);

    $parts = parse_url($url);
    $scheme = $parts['scheme'] ?? '';
    $host = $parts['host'] ?? '';
    $path = $parts['path'] ?? '';
    $query = $parts['query'] ?? '';

    // Text after the last dot of the path (the whole path when it has no dot).
    $segments = explode('.', $path);
    $path_end = end($segments);

    $scheme_ok = empty($scheme) || $scheme == 'https' || $scheme == 'http';
    if (!$scheme_ok) {
        return false;
    }

    $host_ok = empty($host) || (strpos($web_url_domain, $host) !== false) || (strpos($home_url, $host) !== false);
    if (!$host_ok) {
        return false;
    }

    if (!$path || substr($path, 0, 1) != '/' || strpos($path, '.') === false) {
        return false;
    }

    // Pages and feeds are not resources to mirror.
    foreach (['html', 'php', 'com', 'xml'] as $excluded) {
        if (strpos($path_end, $excluded) !== false) {
            return false;
        }
    }

    $existing = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
    if ($existing) {
        // Already mirrored earlier: reuse the stored target.
        return [
            'download' => false,
            'url' => $url,
            'url_complete' => $existing['target']
        ];
    }

    return [
        'download' => true,
        'url' => $url,
        'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '')
    ];
}
//下载并替换资源
/**
 * Mirror the resources listed in $source and rewrite their references in $html.
 *
 * For each entry:
 *  - 'download' false: the reference is replaced with
 *    getImageUrl($entry['url_complete']);
 *  - 'download' true: the resource is passed to CosService::uploadRemote();
 *    when that returns a truthy value a CollectSource row is inserted and the
 *    reference is replaced with getImageUrl() of that value.
 *
 * When the uploaded resource's name ends in "css" or "js", the uploaded copy
 * is fetched again, scanned for nested references (CSS url(...) values, or
 * "...URL:" string properties in JS), each nested reference is processed the
 * same way, and the rewritten file is handed back to CosService::uploadRemote().
 *
 * Each distinct 'url' is handled once. $source usually contains duplicates
 * (the same asset referenced several times in a page); repeating the work
 * would upload and record the asset again and re-run the replacement on
 * markup that has already been rewritten.
 *
 * @param string $html            Page markup to rewrite.
 * @param array  $source          Entries as produced by url_check().
 * @param mixed  $project_id      Project the resources belong to.
 * @param string $domain          Forwarded to url_check() for nested references.
 * @param string $web_url_domain  Forwarded to url_check() for nested references.
 * @param string $home_url        Forwarded to url_check() for nested references.
 * @return string The rewritten markup.
 */
protected function upload_source($html, $source, $project_id, $domain, $web_url_domain, $home_url)
{
    // Loop-invariant: host of the COS CDN, used to skip references that are already mirrored.
    $cos = config('filesystems.disks.cos');
    $cosCdn = $cos['cdn'] ?? '';

    // Original URLs whose references have already been rewritten in $html.
    $handled = [];

    foreach ($source as $vs) {
        if (isset($handled[$vs['url']])) {
            continue;
        }

        if (!$vs['download']) {
            $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
            $handled[$vs['url']] = true;
            continue;
        }

        $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
        if (!$new_source) {
            // Not marked as handled, so a later duplicate entry gets another attempt.
            continue;
        }

        CollectSource::insert([
            'project_id' => $project_id,
            'origin' => $vs['url'],
            'target' => $new_source,
            'created_at' => date('Y-m-d H:i:s'),
            'updated_at' => date('Y-m-d H:i:s'),
        ]);
        $html = str_replace($vs['url'], getImageUrl($new_source), $html);
        $handled[$vs['url']] = true;

        $is_css = substr($new_source, -3, 3) == 'css';
        if (!$is_css && substr($new_source, -2, 2) != 'js') {
            // Only stylesheets and scripts can reference further resources.
            continue;
        }

        $source_html = curl_c(getImageUrl($new_source), false);
        if ($is_css) {
            preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $source_html, $result_source);
        } else {
            // NOTE(review): "[large|thumb]+" is a character class, not an
            // alternation, so it matches any run of those letters before
            // "URL:". Kept as-is because the 'URL:"' trimming below relies on it.
            preg_match_all("/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i", $source_html, $result_source);
        }

        $js_css_source = $result_source[1] ?? [];
        if (!$js_css_source) {
            continue;
        }

        foreach ($js_css_source as $vjs) {
            // A capture may span several properties; keep only what follows the first 'URL:"' inside it.
            if (strpos($vjs, 'URL:"') !== false) {
                $vjs = substr($vjs, strpos($vjs, 'URL:"') + 5);
            }
            $vjs_down = str_replace('&quot;', '', $vjs);

            if (strpos($vjs_down, 'data:') !== false) {
                // Inline data URI, nothing to download.
                continue;
            }
            if (strlen($vjs_down) > 255) {
                // Too long to be stored as an origin.
                continue;
            }

            $vjs_down_arr = parse_url($vjs_down);
            $vjs_down_host = $vjs_down_arr['host'] ?? '';
            if ($vjs_down_host && $vjs_down_host == $cosCdn) {
                // Already points at the CDN.
                continue;
            }

            if (empty($vjs_down_host) && substr($vjs_down, 0, 1) != '/') {
                // Relative reference: swap it in for the last path segment of the parent file's URL.
                $url_arr = explode('/', $vs['url']);
                $url_arr[count($url_arr) - 1] = $vjs_down;
                $vjs_down = implode('/', $url_arr);
            }

            $vjs_result = $this->url_check($vjs_down, $project_id, $domain, $web_url_domain, $home_url);
            if (!$vjs_result) {
                continue;
            }

            if ($vjs_result['download']) {
                $new_vjs = CosService::uploadRemote($project_id, 'source', $vjs_result['url_complete']);
                if ($new_vjs) {
                    CollectSource::insert([
                        'project_id' => $project_id,
                        'origin' => $vjs_result['url'],
                        'target' => $new_vjs,
                        'created_at' => date('Y-m-d H:i:s'),
                        'updated_at' => date('Y-m-d H:i:s'),
                    ]);
                    $source_html = str_replace($vjs, getImageUrl($new_vjs), $source_html);
                }
            } else {
                $source_html = str_replace($vjs, getImageUrl($vjs_result['url_complete']), $source_html);
            }
        }

        // Store the rewritten stylesheet/script under the same name.
        // NOTE(review): the meaning of the 4th/5th uploadRemote() arguments is
        // not visible here - presumably target name and raw content; confirm.
        CosService::uploadRemote($project_id, 'source', $new_source, $new_source, $source_html);
    }

    return $html;
}
}
<?php
namespace App\Console\Commands\Update;
use App\Models\Com\UpdateVisit;
use App\Models\Visit\Visit;
use App\Models\Visit\VisitItem;
use App\Services\ProjectServer;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Redis;
/**
 * Upgrade from 4.0/5.0 to 6.0: synchronise visit records.
 * Class ProjectVisit
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/12/18 15:52
 */
class ProjectVisit extends Command
{
/**
* The name and signature of the console command.
*
* @var string
*/
protected $signature = 'project_visit';
/**
* The console command description.
*
* @var string
*/
protected $description = '执行项目升级访问任务';
/**
 * Command entry point.
 *
 * Raises the PHP memory limit to 512M and then processes visit-sync tasks
 * one after another; the loop has no exit condition.
 */
public function handle()
{
    ini_set('memory_limit', '512M');

    for (;;) {
        $this->start_visit();
    }
}
/**
 * Process one visit-sync task.
 *
 * Takes a task id from get_task(), marks the task STATUS_ING, pages through
 * the task's remote API (200 rows per page) and inserts every row whose
 * original_id is not stored yet, then marks the task STATUS_COM.
 *
 * api_type 'visit_list' imports visit rows; any other api_type imports visit
 * detail rows, each attached to the visit found by ip + day.
 *
 * The 'custom_mysql' connection is disconnected on every path once
 * ProjectServer::useProject() has been called, including the early exit taken
 * when the API reports no rows.
 *
 * @return bool Always true.
 */
protected function start_visit()
{
    $task_id = $this->get_task();
    if (!$task_id) {
        sleep(60);
        return true;
    }

    $task = UpdateVisit::where('id', $task_id)->where('status', UpdateVisit::STATUS_UN)->first();
    if (!$task) {
        sleep(2);
        return true;
    }

    $project_id = $task->project_id;
    $api_type = $task->api_type;
    $api_url = $task->api_url;
    $page_size = 200;

    echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', task_type: ' . $api_type . ', update start' . PHP_EOL;
    $task->status = UpdateVisit::STATUS_ING; // syncing
    $task->save();

    // Point the project-specific database connection at this project.
    $project = ProjectServer::useProject($project_id);
    try {
        if ($project) {
            $is_list = $api_type == 'visit_list';
            $list_type = $is_list ? 'visit_list' : 'visit_detail_list';

            // A one-row request is enough to learn the total row count.
            $data = curl_c($api_url . '?' . http_build_query(['w' => $list_type, 'page' => 1, 'pagesize' => 1]));
            if (!isset($data['count']) || !($data['count'] > 0)) {
                // NOTE(review): as before, the task is left in STATUS_ING here
                // (an API failure and "zero rows" cannot be told apart) -
                // confirm whether it should be completed or reset instead.
                return true;
            }
            $total_page = ceil($data['count'] / $page_size);

            // $model receives the inserts; $p_model looks up the parent visit of a detail row.
            $model = $is_list ? new Visit() : new VisitItem();
            $p_model = $is_list ? null : new Visit();

            for ($page = 1; $page <= $total_page; $page++) {
                $data_page = curl_c($api_url . '?' . http_build_query(['w' => $list_type, 'page' => $page, 'pagesize' => $page_size]));
                if (!isset($data_page['data']) || !$data_page['data']) {
                    continue;
                }

                foreach ($data_page['data'] as $item) {
                    if (!isset($item['id']) || !$item['id']) {
                        continue;
                    }
                    // Skip rows that were imported by an earlier run.
                    if ($model->read(['original_id' => $item['id']], 'id')) {
                        continue;
                    }

                    try {
                        if ($is_list) {
                            $this->insert_visit($model, $item);
                        } else {
                            $this->insert_visit_item($model, $p_model, $item);
                        }
                    } catch (\Exception $e) {
                        // One bad row must not abort the whole task.
                        echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
                    }
                }
            }
        }
    } finally {
        // Always release the project connection, whatever happened above.
        DB::disconnect('custom_mysql');
    }

    $task->status = UpdateVisit::STATUS_COM; // sync finished
    $task->save();
    echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', task_type: ' . $api_type . ', update end ' . PHP_EOL;
    sleep(2);
    return true;
}

/**
 * Insert one remote visit row.
 *
 * @param Visit $model Model used for the insert.
 * @param array $item  Remote row; 'id' is set, 'pv' is read without a default.
 * @return void
 */
protected function insert_visit($model, $item)
{
    $url_arr = parse_url($item['request'] ?? '');
    // Use the row's 'update' value as the timestamp when present, otherwise "now".
    $stamp = isset($item['update']) && $item['update'] ? $item['update'] : time();

    $model->insert([
        'url' => $item['request'] ?? '',
        'referrer_url' => $item['referrer'] ?? '',
        // The remote field really is spelled "is_moblie"; 1 maps to device port 2, anything else to 1.
        'device_port' => isset($item['is_moblie']) && $item['is_moblie'] == 1 ? 2 : 1,
        'country' => $item['ip_area'] ?? '',
        'ip' => $item['ip'] ?? '',
        'depth' => $item['pv'],
        'domain' => $url_arr['host'] ?? '',
        'created_at' => date('Y-m-d H:i:s', $stamp),
        'updated_at' => date('Y-m-d H:i:s', $stamp),
        'updated_date' => date('Y-m-d', isset($item['c_time']) && $item['c_time'] ? strtotime($item['c_time']) : time()),
        'original_id' => $item['id'],
    ]);
}

/**
 * Insert one remote visit-detail row, copying the descriptive fields from the
 * visit that has the same ip and day. Nothing is inserted when no such visit
 * is found.
 *
 * @param VisitItem $model   Model used for the insert.
 * @param Visit     $p_model Model used to look up the parent visit.
 * @param array     $item    Remote row; 'id' is set.
 * @return void
 */
protected function insert_visit_item($model, $p_model, $item)
{
    $p_info = $p_model->read(['ip' => $item['ip'] ?? '', 'updated_date' => $item['day_at'] ?? '']);
    if (!$p_info) {
        return;
    }

    $model->insert([
        'customer_visit_id' => $p_info['id'],
        'url' => $p_info['url'],
        'referrer_url' => $p_info['referrer_url'],
        'device_port' => $p_info['device_port'],
        'country' => $p_info['country'],
        'ip' => $p_info['ip'],
        'domain' => $p_info['domain'],
        'created_at' => $item['time_str'] ?? $p_info['created_at'],
        'updated_at' => $item['time_str'] ?? $p_info['updated_at'],
        'updated_date' => $item['day_at'] ?? $p_info['updated_date'],
        'original_id' => $item['id'],
    ]);
}
//获取任务
/**
 * Get the next visit-sync task id, refilling the Redis queue when it is empty.
 *
 * When the queue has no entry, up to 20 not-yet-started UpdateVisit rows
 * (ordered by sort, then project_id) are pushed onto it and one id is
 * popped again.
 *
 * @return mixed A task id from the queue, false when there are no pending
 *               tasks, or whatever Redis::rpop() yields for an empty list if
 *               the freshly pushed ids were already consumed.
 */
protected function get_task()
{
    $key = 'console_visit_task';

    $queued = Redis::rpop($key);
    if ($queued) {
        return $queued;
    }

    $pending = UpdateVisit::where('status', UpdateVisit::STATUS_UN)
        ->orderBy('sort', 'asc')
        ->orderBy('project_id', 'asc')
        ->limit(20)
        ->get();

    if ($pending->count() == 0) {
        return false;
    }

    foreach ($pending as $row) {
        Redis::lpush($key, $row->id);
    }

    return Redis::rpop($key);
}
}
... ...
... ... @@ -29,12 +29,21 @@ class ImportLogic extends BaseLogic
if (end($ext) != 'csv') {
$this->fail('导入文件格式必须为csv');
}
$domain = $this->param['domain'];
if (strpos($domain, 'https') === false || strpos($domain, 'http') == false) {
$this->fail('请输入完整的采集页面地址');
}
$domain_arr = parse_url($domain);
if (!isset($domain_arr['host'])) {
$this->fail('采集页面地址输入有误');
}
$this->param['domain'] = $domain_arr['host'];
$this->param['project_id'] = $this->user['project_id'];
$this->param['user_id'] = $this->user['id'];
$this->param['status'] = 9;
$rs = $this->model->add($this->param);
if($rs === false){
if ($rs === false) {
$this->fail('error');
}
return $this->success();
... ...
... ... @@ -35,6 +35,7 @@ class ImportTaskRequest extends FormRequest
return [
'type' => ['required', Rule::in([ImportTask::TYPE_PROJECT, ImportTask::TYPE_NEWS, ImportTask::TYPE_BLOG])],
'file_url' => ['required'],
'domain' => ['required'],
];
}
... ... @@ -44,6 +45,7 @@ class ImportTaskRequest extends FormRequest
'type.required' => '导入类型必须',
'type.in' => '导入类型错误',
'file_url.required' => '文件地址必须',
'domain.required' => '采集页面地址必须填写',
];
}
}
... ...
<?php
namespace App\Models\Com;
use Illuminate\Database\Eloquent\Model;
/**
 * Eloquent model backed by the `gl_update_visit` table.
 *
 * The STATUS_* constants are the values stored in the row's `status` column.
 */
class UpdateVisit extends Model
{
    /** Not started. */
    const STATUS_UN = 0;

    /** Import in progress. */
    const STATUS_ING = 1;

    /** Import finished. */
    const STATUS_COM = 2;

    /**
     * Name of the table this model maps to.
     *
     * @var string
     */
    protected $table = 'gl_update_visit';
}
... ...