rcube_charset.php 22.6 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
<?php

/**
 +-----------------------------------------------------------------------+
 | This file is part of the Roundcube Webmail client                     |
 |                                                                       |
 | Copyright (C) The Roundcube Dev Team                                  |
 | Copyright (C) Kolab Systems AG                                        |
 | Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>            |
 |                                                                       |
 | Licensed under the GNU General Public License version 3 or            |
 | any later version with exceptions for skins & plugins.                |
 | See the README file for a full license statement.                     |
 |                                                                       |
 | PURPOSE:                                                              |
 |   Provide charset conversion functionality                            |
 +-----------------------------------------------------------------------+
 | Author: Thomas Bruederli <roundcube@gmail.com>                        |
 | Author: Aleksander Machniak <alec@alec.pl>                            |
 | Author: Edmund Grimley Evans <edmundo@rano.org>                       |
 +-----------------------------------------------------------------------+
*/

/**
 * Character sets conversion functionality
 *
 * @package    Framework
 * @subpackage Core
 */
class rcube_charset
{
    /**
     * Character set aliases (some of them from HTML5 spec.)
     *
     * @var array
     */
    static public $aliases = [
        'USASCII'       => 'WINDOWS-1252',
        'ANSIX31101983' => 'WINDOWS-1252',
        'ANSIX341968'   => 'WINDOWS-1252',
        'UNKNOWN8BIT'   => 'ISO-8859-15',
        'UNKNOWN'       => 'ISO-8859-15',
        'USERDEFINED'   => 'ISO-8859-15',
        'KSC56011987'   => 'EUC-KR',
        'GB2312'        => 'GBK',
        'GB231280'      => 'GBK',
        'UNICODE'       => 'UTF-8',
        'UTF7IMAP'      => 'UTF7-IMAP',
        'TIS620'        => 'WINDOWS-874',
        'ISO88599'      => 'WINDOWS-1254',
        'ISO885911'     => 'WINDOWS-874',
        'MACROMAN'      => 'MACINTOSH',
        '77'            => 'MAC',
        '128'           => 'SHIFT-JIS',
        '129'           => 'CP949',
        '130'           => 'CP1361',
        '134'           => 'GBK',
        '136'           => 'BIG5',
        '161'           => 'WINDOWS-1253',
        '162'           => 'WINDOWS-1254',
        '163'           => 'WINDOWS-1258',
        '177'           => 'WINDOWS-1255',
        '178'           => 'WINDOWS-1256',
        '186'           => 'WINDOWS-1257',
        '204'           => 'WINDOWS-1251',
        '222'           => 'WINDOWS-874',
        '238'           => 'WINDOWS-1250',
        'MS950'         => 'CP950',
        'WINDOWS949'    => 'UHC',
        'WINDOWS1257'   => 'ISO-8859-13',
        'ISO2022JP'     => 'ISO-2022-JP-MS',
    ];

    /**
     * Windows codepages
     *
     * @var array
     */
    static public $windows_codepages = [
         37 => 'IBM037',    // IBM EBCDIC US-Canada
        437 => 'IBM437',    // OEM United States
        500 => 'IBM500',    // IBM EBCDIC International
        708 => 'ASMO-708',  // Arabic (ASMO 708)
        720 => 'DOS-720',   // Arabic (Transparent ASMO); Arabic (DOS)
        737 => 'IBM737',    // OEM Greek (formerly 437G); Greek (DOS)
        775 => 'IBM775',    // OEM Baltic; Baltic (DOS)
        850 => 'IBM850',    // OEM Multilingual Latin 1; Western European (DOS)
        852 => 'IBM852',    // OEM Latin 2; Central European (DOS)
        855 => 'IBM855',    // OEM Cyrillic (primarily Russian)
        857 => 'IBM857',    // OEM Turkish; Turkish (DOS)
        858 => 'IBM00858',  // OEM Multilingual Latin 1 + Euro symbol
        860 => 'IBM860',    // OEM Portuguese; Portuguese (DOS)
        861 => 'IBM861',    // OEM Icelandic; Icelandic (DOS)
        862 => 'DOS-862',   // OEM Hebrew; Hebrew (DOS)
        863 => 'IBM863',    // OEM French Canadian; French Canadian (DOS)
        864 => 'IBM864',    // OEM Arabic; Arabic (864)
        865 => 'IBM865',    // OEM Nordic; Nordic (DOS)
        866 => 'cp866',     // OEM Russian; Cyrillic (DOS)
        869 => 'IBM869',    // OEM Modern Greek; Greek, Modern (DOS)
        870 => 'IBM870',    // IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
        874 => 'windows-874',  // ANSI/OEM Thai (ISO 8859-11); Thai (Windows)
        875 => 'cp875',     // IBM EBCDIC Greek Modern
        932 => 'shift_jis', // ANSI/OEM Japanese; Japanese (Shift-JIS)
        936 => 'gb2312',    // ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
        950 => 'big5',      // ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
        1026 => 'IBM1026',      // IBM EBCDIC Turkish (Latin 5)
        1047 => 'IBM01047',     // IBM EBCDIC Latin 1/Open System
        1140 => 'IBM01140',     // IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
        1141 => 'IBM01141',     // IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
        1142 => 'IBM01142',     // IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
        1143 => 'IBM01143',     // IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
        1144 => 'IBM01144',     // IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
        1145 => 'IBM01145',     // IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
        1146 => 'IBM01146',     // IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
        1147 => 'IBM01147',     // IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
        1148 => 'IBM01148',     // IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
        1149 => 'IBM01149',     // IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
        1200 => 'UTF-16',       // Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
        1201 => 'UTF-16BE',     // Unicode UTF-16, big endian byte order; available only to managed applications
        1250 => 'windows-1250', // ANSI Central European; Central European (Windows)
        1251 => 'windows-1251', // ANSI Cyrillic; Cyrillic (Windows)
        1252 => 'windows-1252', // ANSI Latin 1; Western European (Windows)
        1253 => 'windows-1253', // ANSI Greek; Greek (Windows)
        1254 => 'windows-1254', // ANSI Turkish; Turkish (Windows)
        1255 => 'windows-1255', // ANSI Hebrew; Hebrew (Windows)
        1256 => 'windows-1256', // ANSI Arabic; Arabic (Windows)
        1257 => 'windows-1257', // ANSI Baltic; Baltic (Windows)
        1258 => 'windows-1258', // ANSI/OEM Vietnamese; Vietnamese (Windows)
        10000 => 'macintosh',   // MAC Roman; Western European (Mac)
        12000 => 'UTF-32',      // Unicode UTF-32, little endian byte order; available only to managed applications
        12001 => 'UTF-32BE',    // Unicode UTF-32, big endian byte order; available only to managed applications
        20127 => 'US-ASCII',    // US-ASCII (7-bit)
        20273 => 'IBM273',      // IBM EBCDIC Germany
        20277 => 'IBM277',      // IBM EBCDIC Denmark-Norway
        20278 => 'IBM278',      // IBM EBCDIC Finland-Sweden
        20280 => 'IBM280',      // IBM EBCDIC Italy
        20284 => 'IBM284',      // IBM EBCDIC Latin America-Spain
        20285 => 'IBM285',      // IBM EBCDIC United Kingdom
        20290 => 'IBM290',      // IBM EBCDIC Japanese Katakana Extended
        20297 => 'IBM297',      // IBM EBCDIC France
        20420 => 'IBM420',      // IBM EBCDIC Arabic
        20423 => 'IBM423',      // IBM EBCDIC Greek
        20424 => 'IBM424',      // IBM EBCDIC Hebrew
        20838 => 'IBM-Thai',    // IBM EBCDIC Thai
        20866 => 'koi8-r',      // Russian (KOI8-R); Cyrillic (KOI8-R)
        20871 => 'IBM871',      // IBM EBCDIC Icelandic
        20880 => 'IBM880',      // IBM EBCDIC Cyrillic Russian
        20905 => 'IBM905',      // IBM EBCDIC Turkish
        20924 => 'IBM00924',    // IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
        20932 => 'EUC-JP',      // Japanese (JIS 0208-1990 and 0212-1990)
        20936 => 'cp20936',     // Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
        20949 => 'cp20949',     // Korean Wansung
        21025 => 'cp1025',      // IBM EBCDIC Cyrillic Serbian-Bulgarian
        21866 => 'koi8-u',      // Ukrainian (KOI8-U); Cyrillic (KOI8-U)
        28591 => 'iso-8859-1',  // ISO 8859-1 Latin 1; Western European (ISO)
        28592 => 'iso-8859-2',  // ISO 8859-2 Central European; Central European (ISO)
        28593 => 'iso-8859-3',  // ISO 8859-3 Latin 3
        28594 => 'iso-8859-4',  // ISO 8859-4 Baltic
        28595 => 'iso-8859-5',  // ISO 8859-5 Cyrillic
        28596 => 'iso-8859-6',  // ISO 8859-6 Arabic
        28597 => 'iso-8859-7',  // ISO 8859-7 Greek
        28598 => 'iso-8859-8',  // ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
        28599 => 'iso-8859-9',  // ISO 8859-9 Turkish
        28603 => 'iso-8859-13', // ISO 8859-13 Estonian
        28605 => 'iso-8859-15', // ISO 8859-15 Latin 9
        38598 => 'iso-8859-8-i', // ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
        50220 => 'iso-2022-jp', // ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
        50221 => 'csISO2022JP', // ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
        50222 => 'iso-2022-jp', // ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
        50225 => 'iso-2022-kr', // ISO 2022 Korean
        51932 => 'EUC-JP',      // EUC Japanese
        51936 => 'EUC-CN',      // EUC Simplified Chinese; Chinese Simplified (EUC)
        51949 => 'EUC-KR',      // EUC Korean
        52936 => 'hz-gb-2312',  // HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
        54936 => 'GB18030',     // Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
        65000 => 'UTF-7',
        65001 => 'UTF-8',
    ];

    /**
     * Parse and validate charset name string.
     * Sometimes charset string is malformed, there are also charset aliases,
     * but we need strict names for charset conversion (specially utf8 class)
     *
     * @param string $input Input charset name
     *
     * @return string The validated charset name
     */
    public static function parse_charset($input)
    {
        static $charsets = [];

        $charset = strtoupper($input);

        if (isset($charsets[$input])) {
            return $charsets[$input];
        }

        $charset = preg_replace([
            '/^[^0-9A-Z]+/',    // e.g. _ISO-8859-JP$SIO
            '/\$.*$/',          // e.g. _ISO-8859-JP$SIO
            '/UNICODE-1-1-*/',  // RFC1641/1642
            '/^X-/',            // X- prefix (e.g. X-ROMAN8 => ROMAN8)
            '/\*.*$/'           // lang code according to RFC 2231.5
        ], '', $charset);

        if ($charset == 'BINARY') {
            return $charsets[$input] = null;
        }

        // allow A-Z and 0-9 only
        $str = preg_replace('/[^A-Z0-9]/', '', $charset);

        $result = $charset;

        if (isset(self::$aliases[$str])) {
            $result = self::$aliases[$str];
        }
        // UTF
        else if (preg_match('/U[A-Z][A-Z](7|8|16|32)(BE|LE)*/', $str, $m)) {
            $result = 'UTF-' . $m[1] . (!empty($m[2]) ? $m[2] : '');
        }
        // ISO-8859
        else if (preg_match('/ISO8859([0-9]{0,2})/', $str, $m)) {
            $iso = 'ISO-8859-' . ($m[1] ?: 1);
            // some clients sends windows-1252 text as latin1,
            // it is safe to use windows-1252 for all latin1
            $result = $iso == 'ISO-8859-1' ? 'WINDOWS-1252' : $iso;
        }
        // handle broken charset names e.g. WINDOWS-1250HTTP-EQUIVCONTENT-TYPE
        else if (preg_match('/(WIN|WINDOWS)([0-9]+)/', $str, $m)) {
            $result = 'WINDOWS-' . $m[2];
        }
        // LATIN
        else if (preg_match('/LATIN(.*)/', $str, $m)) {
            $aliases = ['2' => 2, '3' => 3, '4' => 4, '5' => 9, '6' => 10,
                '7' => 13, '8' => 14, '9' => 15, '10' => 16,
                'ARABIC' => 6, 'CYRILLIC' => 5, 'GREEK' => 7, 'GREEK1' => 7, 'HEBREW' => 8
            ];

            // some clients sends windows-1252 text as latin1,
            // it is safe to use windows-1252 for all latin1
            if ($m[1] == 1) {
                $result = 'WINDOWS-1252';
            }
            // we need ISO labels
            else if (!empty($aliases[$m[1]])) {
                $result = 'ISO-8859-'.$aliases[$m[1]];
            }
        }

        $charsets[$input] = $result;

        return $result;
    }

    /**
     * Convert a string from one charset to another.
     *
     * @param string $str  Input string
     * @param string $from Suspected charset of the input string
     * @param string $to   Target charset to convert to; defaults to RCUBE_CHARSET
     *
     * @return string Converted string
     */
    public static function convert($str, $from, $to = null)
    {
        static $iconv_options;

        $to   = empty($to) ? RCUBE_CHARSET : self::parse_charset($to);
        $from = self::parse_charset($from);

        // It is a common case when UTF-16 charset is used with US-ASCII content (#1488654)
        // In that case we can just skip the conversion (use UTF-8)
        if ($from == 'UTF-16' && !preg_match('/[^\x00-\x7F]/', $str)) {
            $from = 'UTF-8';
        }

        if ($from == $to || empty($str) || empty($from)) {
            return $str;
        }

        $out = false;
        $error_handler = function() { throw new \Exception(); };

        // Ignore invalid characters
        $mbstring_sc = mb_substitute_character();
        mb_substitute_character('none');

        // If mbstring reports an illegal character in input via E_WARNING.
        // FIXME: Is this really true with substitute character 'none'?
        // A warning is thrown in PHP<8 also on unsupported encoding, in PHP>=8 ValueError
        // is thrown instead (therefore we catch Throwable below)
        set_error_handler($error_handler, E_WARNING);

        try {
            $out = mb_convert_encoding($str, $to, $from);
        }
        catch (Throwable $e) {
            $out = false;
        }
        catch (Exception $e) {
            $out = false;
        }

        restore_error_handler();
        mb_substitute_character($mbstring_sc);

        if ($out !== false) {
            return $out;
        }

        if ($iconv_options === null) {
            if (function_exists('iconv')) {
                // ignore characters not available in output charset
                $iconv_options = '//IGNORE';
                if (iconv('', $iconv_options, '') === false) {
                    // iconv implementation does not support options
                    $iconv_options = '';
                }
            }
            else {
                $iconv_options = false;
            }
        }

        // Fallback to iconv module, it is slower, but supports much more charsets than mbstring
        if ($iconv_options !== false && $from != 'UTF7-IMAP' && $to != 'UTF7-IMAP'
            && $from !== 'ISO-2022-JP'
        ) {
            // If iconv reports an illegal character in input it means that input string
            // has been truncated. It's reported as E_NOTICE.
            // PHP8 will also throw E_WARNING on unsupported encoding.
            set_error_handler($error_handler, E_NOTICE | E_WARNING);

            try {
                $out = iconv($from, $to . $iconv_options, $str);
            }
            catch (Throwable $e) {
                $out = false;
            }
            catch (Exception $e) {
                $out = false;
            }

            restore_error_handler();

            if ($out !== false) {
                return $out;
            }
        }

        // return the original string
        return $str;
    }

    /**
     * Converts string from standard UTF-7 (RFC 2152) to UTF-8.
     *
     * @param string $str Input string (UTF-7)
     *
     * @return string Converted string (UTF-8)
     * @deprecated use self::convert()
     */
    public static function utf7_to_utf8($str)
    {
        return self::convert($str, 'UTF-7', 'UTF-8');
    }

    /**
     * Converts string from UTF-16 to UTF-8 (helper for utf-7 to utf-8 conversion)
     *
     * @param string $str Input string
     *
     * @return string The converted string
     * @deprecated use self::convert()
     */
    public static function utf16_to_utf8($str)
    {
        return self::convert($str, 'UTF-16BE', 'UTF-8');
    }

    /**
     * Convert the data ($str) from RFC 2060's UTF-7 to UTF-8.
     * If input data is invalid, return the original input string.
     * RFC 2060 obviously intends the encoding to be unique (see
     * point 5 in section 5.1.3), so we reject any non-canonical
     * form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead
     * of &AMAAwA-).
     *
     * @param string $str Input string (UTF7-IMAP)
     *
     * @return string Output string (UTF-8)
     * @deprecated use self::convert()
     */
    public static function utf7imap_to_utf8($str)
    {
        return self::convert($str, 'UTF7-IMAP', 'UTF-8');
    }

    /**
     * Convert the data ($str) from UTF-8 to RFC 2060's UTF-7.
     * Unicode characters above U+FFFF are replaced by U+FFFE.
     * If input data is invalid, return an empty string.
     *
     * @param string $str Input string (UTF-8)
     *
     * @return string Output string (UTF7-IMAP)
     * @deprecated use self::convert()
     */
    public static function utf8_to_utf7imap($str)
    {
        return self::convert($str, 'UTF-8', 'UTF7-IMAP');
    }

    /**
     * A method to guess character set of a string.
     *
     * @param string $string   String
     * @param string $failover Default result for failover
     * @param string $language User language
     *
     * @return string Charset name
     */
    public static function detect($string, $failover = null, $language = null)
    {
        if (substr($string, 0, 4) == "\0\0\xFE\xFF") return 'UTF-32BE';  // Big Endian
        if (substr($string, 0, 4) == "\xFF\xFE\0\0") return 'UTF-32LE';  // Little Endian
        if (substr($string, 0, 2) == "\xFE\xFF")     return 'UTF-16BE';  // Big Endian
        if (substr($string, 0, 2) == "\xFF\xFE")     return 'UTF-16LE';  // Little Endian
        if (substr($string, 0, 3) == "\xEF\xBB\xBF") return 'UTF-8';

        // heuristics
        if (strlen($string) >= 4) {
            if ($string[0] == "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-32BE';
            if ($string[0] != "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] == "\0") return 'UTF-32LE';
            if ($string[0] == "\0" && $string[1] != "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-16BE';
            if ($string[0] != "\0" && $string[1] == "\0" && $string[2] != "\0" && $string[3] == "\0") return 'UTF-16LE';
        }

        if (empty($language)) {
            $rcube    = rcube::get_instance();
            $language = $rcube->get_user_language();
        }

        // Prioritize charsets according to current language (#1485669)
        $prio = null;
        switch ($language) {
        case 'ja_JP':
            $prio = ['ISO-2022-JP', 'JIS', 'UTF-8', 'EUC-JP', 'eucJP-win', 'SJIS', 'SJIS-win'];
            break;

        case 'zh_CN':
        case 'zh_TW':
            $prio = ['UTF-8', 'BIG-5', 'GB2312', 'EUC-TW'];
            break;

        case 'ko_KR':
            $prio = ['UTF-8', 'EUC-KR', 'ISO-2022-KR'];
            break;

        case 'ru_RU':
            $prio = ['UTF-8', 'WINDOWS-1251', 'KOI8-R'];
            break;

        case 'tr_TR':
            $prio = ['UTF-8', 'ISO-8859-9', 'WINDOWS-1254'];
            break;
        }

        // mb_detect_encoding() is not reliable for some charsets (#1490135)
        // use mb_check_encoding() to make charset priority lists really working
        if (!empty($prio) && function_exists('mb_check_encoding')) {
            foreach ($prio as $encoding) {
                if (mb_check_encoding($string, $encoding)) {
                    return $encoding;
                }
            }
        }

        if (function_exists('mb_detect_encoding')) {
            if (empty($prio)) {
                $prio = ['UTF-8', 'SJIS', 'GB2312',
                    'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
                    'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',
                    'ISO-8859-10', 'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',
                    'WINDOWS-1252', 'WINDOWS-1251', 'EUC-JP', 'EUC-TW', 'KOI8-R', 'BIG-5',
                    'ISO-2022-KR', 'ISO-2022-JP',
                ];
            }

            $encodings = array_unique(array_merge($prio, mb_list_encodings()));

            if ($encoding = mb_detect_encoding($string, $encodings)) {
                return $encoding;
            }
        }

        // No match, check for UTF-8
        // from http://w3.org/International/questions/qa-forms-utf-8.html
        if (preg_match('/\A(
            [\x09\x0A\x0D\x20-\x7E]
            | [\xC2-\xDF][\x80-\xBF]
            | \xE0[\xA0-\xBF][\x80-\xBF]
            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}
            | \xED[\x80-\x9F][\x80-\xBF]
            | \xF0[\x90-\xBF][\x80-\xBF]{2}
            | [\xF1-\xF3][\x80-\xBF]{3}
            | \xF4[\x80-\x8F][\x80-\xBF]{2}
            )*\z/xs', substr($string, 0, 2048))
        ) {
            return 'UTF-8';
        }

        return $failover;
    }

    /**
     * Removes non-unicode characters from input.
     * If the input is an array, both values and keys will be cleaned up.
     *
     * @param mixed $input String or array.
     *
     * @return mixed String or array
     */
    public static function clean($input)
    {
        // handle input of type array
        if (is_array($input)) {
            foreach (array_keys($input) as $key) {
                $k = is_string($key) ? self::clean($key) : $key;
                $v = self::clean($input[$key]);

                if ($k !== $key) {
                    unset($input[$key]);
                    if (!array_key_exists($k, $input)) {
                        $input[$k] = $v;
                    }
                }
                else {
                    $input[$k] = $v;
                }
            }
            return $input;
        }

        if (!is_string($input) || $input == '') {
            return $input;
        }

        $msch = mb_substitute_character();
        mb_substitute_character('none');
        $res = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
        mb_substitute_character($msch);

        return $res;
    }
}