Body.php 12.1 KB

原文件审查历史永久链接

<?php

namespace Lib\Imap\Parse;

use Lib\Imap\DataArray;
use Lib\Imap\Fun;

/**
 * Parsed body of a mail message.
 *
 * Takes the raw body text plus the already parsed message header and splits
 * the body into parts (one DataArray per MIME part), decoding each part's
 * Content-Transfer-Encoding on the way.
 *
 * @author：dc
 * @time 2024/9/20 14:57
 * Class Body
 * @package Lib\Imap\Parse
 */
class Body {

    /**
     * Raw body text exactly as passed to the constructor.
     * @var string
     */
    protected string $raw = '';

    /**
     * Parsed message header describing this body (content type, boundary, ...).
     * @var Header
     */
    protected Header $header;


    /**
     * Parsed body parts, in the order they were found.
     * @var DataArray[]
     */
    private array $items = [];

    /**
     * Body constructor.
     *
     * Parses the body immediately: as a multipart body when the header
     * reports a boundary, and as a single text part when the header's
     * content type starts with "text/".
     *
     * @param string $result raw body text
     * @param Header $header parsed message header
     */
    public function __construct(string $result, Header $header)
    {
        $this->raw = $result;
        $this->header = $header;

        // Multipart message: the header carries a boundary.
        $boundary = $this->header->getBoundary();
        if($boundary){
            $this->parseBoundaryBody($this->raw,$boundary);
        }

        // Single-part text message. The cast keeps stripos() from receiving
        // null when the header has no content type.
        if(stripos((string) $this->header->get('content-type'),'text/')===0){
            $this->parseRawHtml($this->raw);
        }
    }


    /**
     * Split a multipart body on its boundary and parse every part.
     *
     * @param string $body     multipart body text
     * @param string $boundary boundary token (without the leading "--")
     * @author：dc
     * @time 2024/9/21 15:49
     */
    private function parseBoundaryBody(string $body, string $boundary): void
    {
        // Delimiter lines are "--boundary" and the closing one is
        // "--boundary--"; normalise all of them to the bare boundary first.
        $normalized = str_replace(
            ['--'.$boundary.'--', $boundary.'--', '--'.$boundary],
            $boundary,
            "\r\n".$body
        );

        // Split on CRLF + boundary rather than the bare boundary: a delimiter
        // always sits on its own line, whereas some providers return the
        // message header together with the body, and that header mentions
        // the boundary mid-line. Splitting on the bare token would cut there.
        $items = explode("\r\n".$boundary, $normalized);

        // The chunk before the first delimiter and the one after the last
        // delimiter are not parts.
        array_shift($items);
        array_pop($items);

        foreach ($items as $item){
            $this->parseItem($item);
        }
    }

    /**
     * Parse a body that is not multipart, i.e. plain html or text.
     *
     * Only raw data that looks like an IMAP FETCH response
     * ("* <n> FETCH (...") is handled; anything else adds no item.
     *
     * @param string $raw raw FETCH response text
     * @author：dc
     * @time 2024/9/21 11:37
     */
    private function parseRawHtml(string $raw): void
    {
        $data = $this->parseMimeHeader('Content-Type: '.$this->header->get('content-type'));

        // Carry the transfer encoding over from the message header.
        $encoding = $this->header->get('Content-Transfer-Encoding');
        if($encoding){
            $data->set('Content-Transfer-Encoding',$encoding);
        }

        // Is this a FETCH item response?
        if(!preg_match('/^\* \d+ FETCH \(/',$raw)){
            return;
        }

        // Keep what sits between the first "(" and the final ")". Byte-wise
        // substr() is used because the offset comes from byte-wise strpos()
        // and the payload charset is not known at this point.
        $raw = trim($raw);
        $lines = explode("\r\n",trim(substr($raw,strpos($raw,'(')+1,-1)));

        /***************** start: first line **************/
        // The first line looks like: UID 1568602721 RFC822.TEXT {589}
        $tokens = explode(' ',$lines[0]);
        $found = false;
        $hasHeader = false;
        // Stop when the tokens run out: array_shift() on an empty array
        // returns null, which would otherwise never end the loop.
        while ($tokens){
            $token = array_shift($tokens);
            if(str_starts_with($token,'RFC822')){
                $found = true;
                // "RFC822" is the whole message including its header;
                // a dotted item such as "RFC822.TEXT" is not.
                $hasHeader = !str_contains($token,'.');
                break;
            }
        }
        if($found){
            // Drop the literal size marker "{589}" that follows the item name.
            if($tokens){
                $tokens[0] = (string) preg_replace('/^\{\d+\}/','',$tokens[0]);
            }
            // The line was split on spaces, so it is rejoined with spaces.
            $lines[0] = implode(' ',$tokens);
        }
        // When no RFC822 item is present the first line is left untouched and
        // no header is stripped, so no content is discarded.
        /***************** end: first line **************/

        /***************** start: last line **************/
        // The last line may hold only trailing items such as "UID 1568602721"
        // or "FLAGS (\Seen)" (Microsoft servers do this); drop it in that case.
        $last = trim((string) end($lines));
        $last = (string) preg_replace('/(UID \d+)|(FLAGS \([^)]*\))/','',$last);
        if(trim($last) === ''){
            array_pop($lines);
        }
        /***************** end: last line **************/

        // Reassemble the remaining lines into one string.
        $content = trim(implode("\r\n",$lines));
        if($hasHeader){
            // The header block is redundant here; keep what follows the first
            // blank line (empty when there is no blank line).
            $content = explode("\r\n\r\n",$content,2)[1] ?? '';
        }
        $data->body = $content;

        $this->items[] = $this->bodyDeCode($data);
    }


    /**
     * Parse one part of a multipart body.
     *
     * A part that declares its own boundary is parsed recursively; any other
     * part is decoded and appended to the item list.
     *
     * @param string $body text of one part (MIME header, blank line, content)
     * @return void
     * @author：dc
     * @time 2024/9/21 9:51
     */
    protected function parseItem(string $body) {
        // The appended blank line guarantees two elements even when the part
        // has no content.
        [$mimeHeader,$text] = explode("\r\n\r\n",trim($body)."\r\n\r\n",2);
        $text = trim($text);

        // Parse the part header.
        $data = $this->parseMimeHeader($mimeHeader);

        // Nested multipart, e.g. mails that carry attachments.
        if($data->boundary){
            $this->parseBoundaryBody($text,$data->boundary);
            return;
        }

        // Notes on content types seen here:
        // - application/octet-stream: opaque binary data; such parts are
        //   simply exposed as attachments.
        // - text/x-amp-html: an HTML flavour (accelerated mobile pages).
        // - message/delivery-status: delivery status report.
        $data->body = $text;
        $this->items[] = $this->bodyDeCode($data);
    }

    /**
     * Decode the part body according to its Content-Transfer-Encoding.
     *
     * quoted-printable and base64 are decoded; 7bit, 8bit, binary and unknown
     * or missing encodings leave the body unchanged.
     *
     * @param DataArray $data part whose body is decoded in place
     * @return DataArray the same instance
     * @author：dc
     * @time 2024/9/21 11:39
     */
    private function bodyDeCode(DataArray $data):DataArray {
        $encoding = strtolower(trim((string) $data->get('Content-Transfer-Encoding')));

        switch ($encoding){
            case 'quoted-printable':
                $data->body = quoted_printable_decode((string) $data->body);
                break;
            case 'base64':
                $data->body = base64_decode((string) $data->body);
                break;
            default:
                // 7bit / 8bit / binary are identity encodings: the content is
                // already in its final form. In particular an 8bit body must
                // not go through mb_decode_mimeheader(), which is meant for
                // encoded-words in header fields.
                break;
        }

        return $data;
    }


    /**
     * Split a structured header value into its main value and parameters.
     *
     * Examples of the values handled:
     *   Content-Type: text/html;charset=utf-8
     *   Content-Disposition: attachment;
     *     filename="=?UTF-8?B?6LaF6L+5VlPot6jlooPmkJwg5pWw5o2u5a+55q+ULnBkZg==?="
     *
     * Every "name=value" parameter is stored under its own name (quotes
     * removed, value passed through DeCode::decode); the bare value replaces
     * the header entry itself.
     *
     * @param DataArray $data parsed header, updated in place
     * @param string $key header name to split
     * @author：dc
     * @time 2024/9/21 16:50
     */
    private function parseMimeHeaderChild(DataArray $data,$key): void
    {
        $value = $data->get($key);
        if(!$value){
            return;
        }

        // One segment per ";"-separated piece.
        foreach (explode(';',trim($value)) as $segment){
            $segment = trim($segment);
            if(str_contains($segment,'=')){
                // A parameter; tolerate spaces around "=" (charset = utf-8).
                [$name,$val] = explode('=',$segment,2);
                $data->set(trim($name),DeCode::decode(trim(str_replace('"','',$val))));
            }elseif($segment){
                $data->set($key,$segment);
            }
        }
    }

    /**
     * Parse the header block of one body part.
     *
     * @param string $header header lines separated by CRLF
     * @return DataArray parsed header fields
     * @author：dc
     * @time 2024/9/21 9:18
     */
    protected function parseMimeHeader(string $header):DataArray {
        $data = new DataArray();
        $name = '';
        foreach (explode("\r\n",trim($header)) as $line){
            // A line starting with whitespace continues the previous field.
            // NOTE(review): the third argument presumably appends to the
            // existing value — confirm against DataArray::set.
            if(str_starts_with($line,' ') || str_starts_with($line,"\t")){
                $data->set($name,' '.$line,true);
                continue;
            }
            // Pad so a line without ":" yields an empty value instead of an
            // undefined index.
            [$name,$value] = array_pad(explode(':',$line,2),2,'');
            $data->set($name,trim($value));
        }

        // Split out parameters, e.g. Content-Type: text/html;charset=utf-8
        $this->parseMimeHeaderChild($data,'Content-Type');

        // Same for Content-Disposition: attachment; filename="..."
        $this->parseMimeHeaderChild($data,'Content-Disposition');

        // Default charset: the message header's charset, else utf-8.
        if(!$data->Charset)
        {
            $data->Charset = $this->header->get('charset') ? : 'utf-8';
        }

        // Strip quotes and angle brackets from the content id.
        $data->set('Content-ID',trim(str_replace(['"','<','>'],'',(string) $data->get('Content-ID'))));

        return $data;
    }


    /**
     * Read the plain-text content.
     *
     * Falls back to the HTML content with tags stripped when there is no
     * usable text/plain part.
     *
     * @return string
     * @author：dc
     * @time 2024/9/21 9:55
     */
    public function getText():string {
        return $this->getHtmlOrText('text/plain') ? : strip_tags($this->getHtml());
    }

    /**
     * Return the body of the first part with the given content type,
     * converted to utf-8 when the part declares another charset.
     *
     * @param string $t content type, e.g. "text/plain" or "text/html"
     * @return mixed|string empty string when no part matches
     * @author：dc
     * @time 2024/9/21 17:52
     */
    private function getHtmlOrText(string $t){
        foreach ($this->items as $item){
            if(!$item->eq('content-type',$t)){
                continue;
            }
            // Some attachments are .txt files; those are not the mail text.
            if($t === 'text/plain' && $item->eq('Content-Disposition','attachment')){
                continue;
            }
            if($item->eq('charset','utf-8')){
                return $item->body;
            }
            return Fun::mb_convert_encoding($item->body,'utf-8',$item->charset);
        }
        return '';
    }

    /**
     * Read the HTML content.
     * @return string
     * @author：dc
     * @time 2024/9/21 10:02
     */
    public function getHtml():string {
        return $this->getHtmlOrText('text/html');
    }

    /**
     * Return the HTML content with embedded images inlined.
     *
     * Some mails ship their images as parts referenced from the markup as
     * <img src="cid:xxxx" />. Each such reference is replaced by a base64
     * data URI built from the matching part.
     *
     * The declared return type used to be array although the method has
     * always produced the HTML string, so every call raised a TypeError.
     *
     * @return string HTML with cid: references replaced
     * @author：dc
     * @time 2024/9/21 10:53
     */
    public function getHtmlAndImg():string {
        $html = $this->getHtml();

        foreach ($this->getAttachment(true) as $item){
            $cid = (string) $item->getContentId();
            // Skip parts the markup never references; this also avoids
            // base64-encoding their content for nothing.
            if($cid === '' || stripos($html,'cid:'.$cid) === false){
                continue;
            }

            $dataUri = '"data:'.$item->getFileType().';base64,'.base64_encode((string) $item->getContent()).'"';

            // The content id is quoted so characters such as ".", "$", "+" or
            // "/" are matched literally. A callback is used so the replacement
            // text is never scanned for back-references.
            $html = preg_replace_callback(
                '/[\'"]cid:'.preg_quote($cid,'/').'[\'"]/i',
                static fn (array $match): string => $dataUri,
                $html
            ) ?? $html;
        }

        return $html;
    }

    /**
     * Read attachments. There are two kinds:
     * 1. "attachment": a regular file attachment;
     * 2. inline: embedded in the HTML, usually images referenced as cid:xxx.
     *
     * Parts without a file name are never returned.
     *
     * @param bool|null $inline null: both kinds; true: only parts that have a
     *                          Content-ID; false: only parts whose
     *                          Content-Disposition is "attachment"
     * @return Attachment[] sequentially indexed list
     * @author：dc
     * @time 2024/9/23 10:23
     */
    public function getAttachment(bool|null $inline = null):array {
        $attachment = [];
        foreach ($this->items as $item){
            $isAttachment = $item->eq('Content-Disposition','attachment');
            // Some mail servers omit "inline"; the Content-ID is then the only
            // way to recognise an embedded part.
            $hasContentId = (bool) $item->get('Content-ID');

            $wanted = match ($inline) {
                null  => $isAttachment || $hasContentId,
                true  => $hasContentId,
                false => $isAttachment,
            };
            if(!$wanted){
                continue;
            }

            $candidate = new Attachment($item);
            if($candidate->getFilename()){
                $attachment[] = $candidate;
            }
        }

        return $attachment;
    }


    /**
     * Get the raw body text as passed to the constructor.
     * @return string
     */
    public function getRaw(): string
    {
        return $this->raw;
    }


    /**
     * Get all parsed body parts.
     * @return DataArray[]
     */
    public function getItems(): array
    {
        return $this->items;
    }

}