|
...
|
...
|
@@ -60,8 +60,30 @@ class ProofreadingController extends BaseController |
|
|
|
* @time :2023/11/22 10:02
|
|
|
|
*/
|
|
|
|
public function getUrlRead($url){
    // Downloads the page at $url, removes <style>, <script>, <link> and
    // <footer> markup, and returns the distinct non-blank text fragments
    // found between tags, wrapped by $this->success().

    // NOTE(review): TLS peer verification is disabled here, so the remote
    // host's certificate is not checked. Kept for backward compatibility;
    // confirm whether this is still required.
    // NOTE(review): $url is handed to file_get_contents() unvalidated. If it
    // can come from user input, restrict it to http/https upstream.
    $context = stream_context_create([
        'ssl' => [
            'verify_peer' => false,
            'verify_peer_name' => false,
        ],
    ]);

    // Fetch the page exactly once, using the stream context built above.
    $html = file_get_contents($url, false, $context);
    if ($html === false) {
        // Nothing could be downloaded, so there is no text to extract.
        return $this->success([]);
    }

    // Non-content markup to discard before extracting text. The `i` flag
    // makes tag names case-insensitive; `s` lets `.` span line breaks.
    $removalPatterns = [
        '/<style\b[^>]*>(.*?)<\/style>/is',   // <style> elements and their content
        '/<script\b[^>]*>(.*?)<\/script>/is', // <script> elements and their content
        '/<link\b[^>]*>/i',                   // <link> tags
        '/<footer\b[^>]*>(.*?)<\/footer>/is', // <footer> elements and their content
    ];
    $cleaned = preg_replace($removalPatterns, '', $html);
    if ($cleaned === null) {
        // preg_replace() signals a PCRE failure with null; treat the page as
        // having no extractable text instead of feeding null onwards.
        return $this->success([]);
    }

    // Capture every run of characters sitting between a '>' and the next '<'.
    $matches = [];
    preg_match_all('/>([^<]+)</', $cleaned, $matches);
    $fragments = $matches[1] ?? [];

    // Keep fragments that contain something other than whitespace. A strict
    // comparison with '' is used because empty() would also discard "0".
    $fragments = array_filter($fragments, static function ($item) {
        return trim($item) !== '';
    });

    // array_unique() preserves keys, so reindex afterwards to return a
    // gap-free list.
    $uniqueFragments = array_values(array_unique($fragments));

    return $this->success($uniqueFragments);
}
|
|
|
|
} |
...
|
...
|
|