作者 刘锟

update

... ... @@ -39,14 +39,15 @@ class HtmlCollect extends Command
/**
 * Entry point of the command: keeps the collector running continuously.
 *
 * Invokes start_collect() back-to-back inside an endless loop. The loop has
 * no exit condition of its own, so this method only stops when the process
 * is terminated or start_collect() throws.
 */
public function handle()
{
    for (;;) {
        $this->start_collect();
    }
}
protected function start_collect()
{
$task_id = $this->get_task();
// $task_id = $this->get_task();
$task_id = '595_41517';
if ($task_id === false) {
//所有项目采集完成
sleep(60);
... ... @@ -336,62 +337,25 @@ class HtmlCollect extends Command
]);
$html = str_replace($vs['url'], getImageUrl($new_source), $html);
if (substr($new_source, -3, 3) == 'css') {
// 下载css文件中的资源
$css_html = curl_c($vs['url_complete'], false);
preg_match_all("/url\(['\"](\s*[^>]+?)['\"]\)/i", $css_html, $result_css_source);
$css_source = $result_css_source[1] ?? [];
$url_arr = explode('/', $vs['url_complete']);
$target_arr = explode('/', $new_source);
foreach ($css_source as $vcs) {
$vcs = str_replace('"', '', $vcs);
$vcs_arr = parse_url($vcs);
if (isset($vcs_arr['domain'])) {
//不是相对路径,不下载
continue;
}
if (substr($new_source, -3, 3) == 'css' || substr($new_source, -2, 2) == 'js') {
$vcs = $vcs_arr['path'] ?? '';
if (!$vcs) {
continue;
}
if (strpos($vcs, '.') === false) {
continue;
}
$path_arr = explode('.', $vcs);
if (in_array(end($path_arr), ['html', 'php', 'com', 'xml'])) {
continue;
}
$source_html = curl_c(getImageUrl($new_source), false);
$source_info = CollectSource::where('project_id', $project_id)->where('origin', $vcs)->first();
if ($source_info) {
//已存在,不下载
continue;
}
$url_arr[count($url_arr) - 1] = $vcs;
$url_css_complete = implode('/', $url_arr);
$target_arr[count($target_arr) - 1] = $vcs;
$path = implode('/', $target_arr);
$new_source_css = CosService::uploadRemote($project_id, 'source', $url_css_complete, $path);
if ($new_source_css) {
CollectSource::insert([
'project_id' => $project_id,
'origin' => $vcs,
'target' => $new_source_css,
'created_at' => date('Y-m-d H:i:s'),
'updated_at' => date('Y-m-d H:i:s'),
]);
}
if (substr($new_source, -3, 3) == 'css') {
preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $source_html, $result_source);
} else {
preg_match_all("/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i", $source_html, $result_source);
}
} elseif (substr($new_source, -2, 2) == 'js') {
$js_html = curl_c(getImageUrl($new_source), false);
preg_match_all("/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i", $js_html, $result_js_source);
$js_source = $result_js_source[1] ?? [];
if($js_source){
foreach ($js_source as $vjs) {
$js_css_source = $result_source[1] ?? [];
if ($js_css_source) {
foreach ($js_css_source as $vjs) {
if (substr($vjs, 0, 2) == './') {
//相对路径
$url_arr = explode('/', $vs['url']);
$url_arr[count($url_arr) - 1] = substr($vjs, 2);
$vjs = implode('/', $url_arr);
}
$vjs_result = $this->url_check($vjs, $project_id, $domain, $web_url_domain, $home_url);
if (!$vjs_result) {
continue;
... ... @@ -407,25 +371,39 @@ class HtmlCollect extends Command
'created_at' => date('Y-m-d H:i:s'),
'updated_at' => date('Y-m-d H:i:s'),
]);
$js_html = str_replace($vjs, getImageUrl($new_vjs), $js_html);
$source_html = str_replace($vjs, getImageUrl($new_vjs), $source_html);
}
} else {
$js_html = str_replace($vjs, getImageUrl($vjs_result['url_complete']), $js_html);
$source_html = str_replace($vjs, getImageUrl($vjs_result['url_complete']), $source_html);
}
}
CosService::uploadRemote($project_id, 'source', $new_source, $new_source, $js_html);
CosService::uploadRemote($project_id, 'source', $new_source, $new_source, $source_html);
}
}
}
} else {
$html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
if(substr($vs['url_complete'], -2, 2) == 'js'){
$js_html = curl_c(getImageUrl($vs['url_complete']), false);
preg_match_all("/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i", $js_html, $result_js_source);
$js_source = $result_js_source[1] ?? [];
if($js_source){
foreach ($js_source as $vjs) {
if (substr($vs['url_complete'], -3, 3) == 'css' || substr($vs['url_complete'], -2, 2) == 'js') {
$source_html = curl_c(getImageUrl($vs['url_complete']), false);
if (substr($vs['url_complete'], -3, 3) == 'css') {
preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $source_html, $result_source);
} else {
preg_match_all("/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i", $source_html, $result_source);
}
$js_css_source = $result_source[1] ?? [];
if ($js_css_source) {
foreach ($js_css_source as $vjs) {
if (substr($vjs, 0, 2) == './') {
//相对路径
$url_arr = explode('/', $vs['url']);
$url_arr[count($url_arr) - 1] = substr($vjs, 2);
$vjs = implode('/', $url_arr);
}
$vjs_result = $this->url_check($vjs, $project_id, $domain, $web_url_domain, $home_url);
if (!$vjs_result) {
continue;
... ... @@ -441,14 +419,14 @@ class HtmlCollect extends Command
'created_at' => date('Y-m-d H:i:s'),
'updated_at' => date('Y-m-d H:i:s'),
]);
$js_html = str_replace($vjs, getImageUrl($new_vjs), $js_html);
$source_html = str_replace($vjs, getImageUrl($new_vjs), $source_html);
}
} else {
$js_html = str_replace($vjs, getImageUrl($vjs_result['url_complete']), $js_html);
$source_html = str_replace($vjs, getImageUrl($vjs_result['url_complete']), $source_html);
}
}
CosService::uploadRemote($project_id, 'source', $vs['url_complete'], $vs['url_complete'], $js_html);
CosService::uploadRemote($project_id, 'source', $vs['url_complete'], $vs['url_complete'], $source_html);
}
}
}
... ...