|
...
|
...
|
@@ -53,6 +53,7 @@ class HtmlCollect extends Command |
|
|
|
|
|
|
|
protected function start_collect()
|
|
|
|
{
|
|
|
|
$tdk_project_ids = [714];
|
|
|
|
$task_id = $this->get_task();
|
|
|
|
if ($task_id === false) {
|
|
|
|
//All projects have finished collecting
|
|
...
|
...
|
@@ -108,7 +109,9 @@ class HtmlCollect extends Command |
|
|
|
}
|
|
|
|
|
|
|
|
//Extract the page's TDK (title, description, keywords)
|
|
|
|
if(in_array($project_id,$tdk_project_ids)){
|
|
|
|
$this->get_site_meta($new_html, $collect_info);
|
|
|
|
}
|
|
|
|
|
|
|
|
$source_list = $this->html_preg($new_html, $project_id, $collect_info->domain, $old_info['web_url_domain'], $old_info['home_url']);
|
|
|
|
|
|
...
|
...
|
@@ -205,13 +208,13 @@ class HtmlCollect extends Command |
|
|
|
#Title
|
|
|
|
preg_match_all('/<title>([\w\W]*?)<\/title>/', $html, $matches);
|
|
|
|
if (!empty($matches[1])) {
|
|
|
|
$meta['title'] = substr($matches[1][0], 0, 70);
|
|
|
|
$meta['title'] = substr($matches[1][0], 0, 255);
|
|
|
|
}
|
|
|
|
|
|
|
|
#Keywords
|
|
|
|
preg_match_all('/<meta\s+[^>]*?name=[\'|\"]keywords[\'|\"]\s+[^>]*?content=[\'|\"]([\w\W]*?)[\'|\"]/', $html, $matches);
|
|
|
|
if (!empty($matches[1])) {
|
|
|
|
$meta['keyword'] = substr($matches[1][0], 0, 200);
|
|
|
|
$meta['keyword'] = substr($matches[1][0], 0, 255);
|
|
|
|
}
|
|
|
|
|
|
|
|
#Description
|
...
|
...
|
|