|
...
|
...
|
@@ -38,14 +38,13 @@ class HtmlCollect extends Command |
|
|
|
public function handle()
|
|
|
|
{
|
|
|
|
// while (true) {
|
|
|
|
$this->start_update();
|
|
|
|
$this->start_collect();
|
|
|
|
// }
|
|
|
|
}
|
|
|
|
|
|
|
|
protected function start_update()
|
|
|
|
protected function start_collect()
|
|
|
|
{
|
|
|
|
// $task_id = $this->get_task();
|
|
|
|
$task_id = '298_1';
|
|
|
|
$task_id = $this->get_task();
|
|
|
|
if ($task_id === false) {
|
|
|
|
//all projects have finished collecting
|
|
|
|
sleep(60);
|
|
...
|
...
|
@@ -70,7 +69,7 @@ class HtmlCollect extends Command |
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', collect start' . PHP_EOL;
|
|
|
|
echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect start' . PHP_EOL;
|
|
|
|
|
|
|
|
$collect_info->status = CollectTask::STATUS_ING;
|
|
|
|
$collect_info->save();
|
|
...
|
...
|
@@ -84,7 +83,7 @@ class HtmlCollect extends Command |
|
|
|
$html = $this->upload_source($html, $source_list, $project_id);
|
|
|
|
}
|
|
|
|
} catch (\Exception $e) {
|
|
|
|
echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', error: ' . $e->getMessage() . PHP_EOL;
|
|
|
|
echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: ' . $e->getMessage() . PHP_EOL;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
...
|
...
|
@@ -92,7 +91,7 @@ class HtmlCollect extends Command |
|
|
|
$collect_info->status = CollectTask::STATUS_COM;
|
|
|
|
$collect_info->save();
|
|
|
|
|
|
|
|
echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', collect end' . PHP_EOL;
|
|
|
|
echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect end' . PHP_EOL;
|
|
|
|
}
|
|
|
|
//close the database connection
|
|
|
|
DB::disconnect('custom_mysql');
|
|
...
|
...
|
@@ -150,7 +149,7 @@ class HtmlCollect extends Command |
|
|
|
return $source;
|
|
|
|
}
|
|
|
|
|
|
|
|
//图片
|
|
|
|
//image
|
|
|
|
preg_match_all('/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_img);
|
|
|
|
$img = $result_img[2] ?? [];
|
|
|
|
foreach ($img as $vi) {
|
|
...
|
...
|
@@ -182,6 +181,15 @@ class HtmlCollect extends Command |
|
|
|
$check_vc && $source[] = $check_vc;
|
|
|
|
}
|
|
|
|
|
|
|
|
//css background
|
|
|
|
preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $html, $result_css_b);
|
|
|
|
$css_b = $result_css_b[1] ?? [];
|
|
|
|
foreach ($css_b as $vc_b) {
|
|
|
|
$check_vc_b = $this->url_check($vc_b, $project_id, $domain);
|
|
|
|
$check_vc_b && $source[] = $check_vc_b;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return $source;
|
|
|
|
}
|
|
|
|
|
|
...
|
...
|
@@ -194,9 +202,11 @@ class HtmlCollect extends Command |
|
|
|
$host = $arr['host'] ?? '';
|
|
|
|
$path = $arr['path'] ?? '';
|
|
|
|
|
|
|
|
if ((strpos($host, '.globalso.') === false)
|
|
|
|
&& (strpos($host, '.goodao.') === false)
|
|
|
|
&& $path && (strpos($path, '.') !== false)) {
|
|
|
|
if (
|
|
|
|
(strpos($host, '.globalso.') === false) &&
|
|
|
|
(strpos($host, '.goodao.') === false) &&
|
|
|
|
$path && (strpos($path, '.') !== false)
|
|
|
|
) {
|
|
|
|
|
|
|
|
$source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
|
|
|
|
if (!$source) {
|
|
...
|
...
|
@@ -236,6 +246,50 @@ class HtmlCollect extends Command |
|
|
|
'updated_at' => date('Y-m-d H:i:s'),
|
|
|
|
]);
|
|
|
|
$html = str_replace($vs['url'], getImageUrl($new_source), $html);
|
|
|
|
|
|
|
|
if (substr($new_source, -3, 3) == 'css') {
    // The uploaded file is a stylesheet: also fetch the resources it
    // references through relative url(...) entries and upload them next to
    // the stylesheet, so the relative references keep resolving.
    $css_html = file_get_contents($vs['url_complete']);

    // file_get_contents() returns false on failure; in that case there is
    // nothing to scan, so skip the nested download entirely.
    if ($css_html !== false) {
        preg_match_all("/url\(['\"](\s*[^>]+?)['\"]\)/i", $css_html, $result_css_source);
        $css_source = $result_css_source[1] ?? [];

        // Both arrays are reused for every entry: only their last segment
        // (the stylesheet's file name) is swapped for the referenced path.
        $url_arr = explode('/', $vs['url_complete']);
        $target_arr = explode('/', $new_source);
        foreach ($css_source as $vcs) {
            $vcs_arr = parse_url($vcs);
            if ($vcs_arr === false) {
                // Seriously malformed URL, nothing usable to download.
                continue;
            }

            // parse_url() reports the authority under the 'host' key (there
            // is no 'domain' key). A host or a scheme (http:, data:, ...)
            // means the reference is not a relative path, so do not download.
            if (isset($vcs_arr['host']) || isset($vcs_arr['scheme'])) {
                continue;
            }

            // Keep only the path part; query string and fragment are dropped.
            $vcs = $vcs_arr['path'] ?? '';
            if (!$vcs) {
                continue;
            }

            $source_info = CollectSource::where('project_id', $project_id)->where('origin', $vcs)->first();
            if ($source_info) {
                // Already recorded for this project, do not download again.
                continue;
            }

            // Remote URL of the resource: the stylesheet's URL with its last
            // segment replaced by the relative path.
            $url_arr[count($url_arr) - 1] = $vcs;
            $url_css_complete = implode('/', $url_arr);

            // Upload target: same replacement applied to the stylesheet's
            // uploaded location.
            $target_arr[count($target_arr) - 1] = $vcs;
            $path = implode('/', $target_arr);

            $new_source_css = CosService::uploadRemote($project_id, 'source', $url_css_complete, $path);
            if ($new_source_css) {
                CollectSource::insert([
                    'project_id' => $project_id,
                    'origin' => $vcs,
                    'target' => $new_source_css,
                    'created_at' => date('Y-m-d H:i:s'),
                    'updated_at' => date('Y-m-d H:i:s'),
                ]);
            }
        }
    }
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
$html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
|
...
|
...
|
|