Remove use of unreliable charset detection (#8344)

3 years ago · f23aaed10e
8 changed files with 61 additions and 40 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,7 @@
 - Update to jQuery-UI 1.13.1 (#8455)
 - Use navigator.pdfViewerEnabled for PDF viewer detection
 - Remove use of unreliable charset detection (#8344)
 - Password: Add support for ssha256 algorithm (#8459)
 - Fix slow loading of long HTML content into the HTML editor (#8108)
 - Fix bug where SMTP password didn't work if it contained '%p' (#8435)
--- a/program/lib/Roundcube/rcube_charset.php
+++ b/program/lib/Roundcube/rcube_charset.php
@ -348,6 +348,46 @@ class rcube_charset
        return $str;
    }
    /**
     * Check if the specified input string matches one of the provided charsets.
     * Besides the supplied candidates this also tries UTF-32 and UTF-16 (when
     * a BOM or a matching NUL-byte pattern is found), RCUBE_CHARSET and the
     * configured default_charset.
     *
     * Only the first 100 KB of the input is inspected.
     *
     * @param string $str      Input string
     * @param array  $charsets Suspected charsets of the input string
     *
     * @return string|null First matching charset, NULL if none of the candidates matches
     */
    public static function check($str, $charsets = [])
    {
        $chunk = strlen($str) > 100 * 1024 ? substr($str, 0, 100 * 1024) : $str;

        // Add default charset, system charset and easily detectable charsets to the list.
        // Candidates are tried in list order, so caller-supplied charsets come first.

        // Byte Order Marks
        if (substr($chunk, 0, 4) === "\0\0\xFE\xFF") {
            $charsets[] = 'UTF-32BE';
        }
        if (substr($chunk, 0, 4) === "\xFF\xFE\0\0") {
            $charsets[] = 'UTF-32LE';
        }
        if (substr($chunk, 0, 2) === "\xFE\xFF") {
            $charsets[] = 'UTF-16BE';
        }
        if (substr($chunk, 0, 2) === "\xFF\xFE") {
            $charsets[] = 'UTF-16LE';
        }

        // Heuristics: NUL-byte patterns that appear anywhere in the inspected chunk
        if (preg_match('/\x00\x00\x00[^\x00]/', $chunk)) {
            $charsets[] = 'UTF-32BE';
        }
        if (preg_match('/[^\x00]\x00\x00\x00/', $chunk)) {
            $charsets[] = 'UTF-32LE';
        }
        if (preg_match('/\x00[^\x00]\x00[^\x00]/', $chunk)) {
            $charsets[] = 'UTF-16BE';
        }
        if (preg_match('/[^\x00]\x00[^\x00]\x00/', $chunk)) {
            $charsets[] = 'UTF-16LE';
        }

        $charsets[] = RCUBE_CHARSET;
        $charsets[] = (string) rcube::get_instance()->config->get('default_charset');

        // Pass every name through parse_charset(), then drop empty results and duplicates
        $charsets = array_map(['rcube_charset', 'parse_charset'], $charsets);
        $charsets = array_unique(array_filter($charsets));

        foreach ($charsets as $charset) {
            // A candidate is accepted when the converted chunk is left unchanged by clean()
            $ret = self::convert($chunk, $charset);
            if ($ret === self::clean($ret)) {
                return $charset;
            }
        }

        return null;
    }
    /**
     * Converts string from standard UTF-7 (RFC 2152) to UTF-8.
     *
@ -415,6 +455,7 @@ class rcube_charset
     * @param string $language User language
     *
     * @return string Charset name
     * @deprecated
     */
    public static function detect($string, $failover = null, $language = null)
    {
--- a/program/lib/Roundcube/rcube_csv2vcard.php
+++ b/program/lib/Roundcube/rcube_csv2vcard.php
@ -420,12 +420,12 @@ class rcube_csv2vcard
     */
    public function import($csv, $dry_run = false, $skip_head = true)
    {
        // convert to UTF-8
        $head      = substr($csv, 0, 4096);
        $charset   = rcube_charset::detect($head, RCUBE_CHARSET);
        $csv       = rcube_charset::convert($csv, $charset);
        $csv       = preg_replace(['/^[\xFE\xFF]{2}/', '/^\xEF\xBB\xBF/', '/^\x00+/'], '', $csv); // also remove BOM
        $head      = '';
        // convert to UTF-8 (supports default_charset and RCUBE_CHARSET as input)
        // TODO: If the input charset is invalid we should probably just abort here
        if ($charset = rcube_charset::check($csv)) {
            $csv = rcube_charset::convert($csv, $charset);
        }
        $csv = preg_replace(['/^[\xFE\xFF]{2}/', '/^\xEF\xBB\xBF/', '/^\x00+/'], '', $csv); // also remove BOM
        // Split CSV file into lines
        $lines = rcube_utils::explode_quoted_string('[\r\n]+', $csv);
--- a/program/lib/Roundcube/rcube_imap.php
+++ b/program/lib/Roundcube/rcube_imap.php
@ -2362,7 +2362,7 @@ class rcube_imap extends rcube_storage
                $charset = $this->struct_charset;
            }
            else {
                $charset = rcube_charset::detect($filename_mime, $this->default_charset);
                $charset = $this->default_charset;
            }
            $part->filename = rcube_mime::decode_mime_string($filename_mime, $charset);
--- a/program/lib/Roundcube/rcube_message.php
+++ b/program/lib/Roundcube/rcube_message.php
@ -1223,21 +1223,9 @@ class rcube_message
            $charsets[] = $this->headers->charset;
        }
        if (empty($charsets)) {
            $rcube      = rcube::get_instance();
            $charsets[] = rcube_charset::detect($name, $rcube->config->get('default_charset', RCUBE_CHARSET));
        }
        foreach (array_unique($charsets) as $charset) {
            $_name = rcube_charset::convert($name, $charset);
            if ($_name == rcube_charset::clean($_name)) {
                if (!$part->charset) {
                    $part->charset = $charset;
                }
                return $_name;
            }
        if ($charset = rcube_charset::check($name, $charsets)) {
            $name = rcube_charset::convert($name, $charset);
            $part->charset = $charset;
        }
        return $name;
--- a/program/lib/Roundcube/rcube_vcard.php
+++ b/program/lib/Roundcube/rcube_vcard.php
@ -994,21 +994,6 @@ class rcube_vcard
     */
    private static function detect_encoding($string)
    {
        // Detect common encodings
        if (substr($string, 0, 4) == "\0\0\xFE\xFF") return 'UTF-32BE';  // Big Endian
        if (substr($string, 0, 4) == "\xFF\xFE\0\0") return 'UTF-32LE';  // Little Endian
        if (substr($string, 0, 2) == "\xFE\xFF")     return 'UTF-16BE';  // Big Endian
        if (substr($string, 0, 2) == "\xFF\xFE")     return 'UTF-16LE';  // Little Endian
        if (substr($string, 0, 3) == "\xEF\xBB\xBF") return 'UTF-8';
        // heuristics
        if (strlen($string) >= 4) {
            if ($string[0] == "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-32BE';
            if ($string[0] != "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] == "\0") return 'UTF-32LE';
            if ($string[0] == "\0" && $string[1] != "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-16BE';
            if ($string[0] != "\0" && $string[1] == "\0" && $string[2] != "\0" && $string[3] == "\0") return 'UTF-16LE';
        }
        // Extract the plain text from the vCard, so the detection is more accurate
        // This will for example exclude photos
@ -1024,6 +1009,9 @@ class rcube_vcard
            $prefix = substr($lines[$i], 0, $pos);
            // We remove \0 so that it works with UTF-16/UTF-32 encodings
            $prefix = str_replace("\0", '', $prefix);
            // Take only properties that are known to contain human-readable text
            if (!preg_match('/^(item\d+\.)?(N|FN|ORG|ADR|NOTE|TITLE|CATEGORIES)(;|$)/', $prefix)) {
                continue;
@ -1060,10 +1048,13 @@ class rcube_vcard
            }
            $string .= $data . ' ';
        }
        $fallback = rcube::get_instance()->config->get('default_charset', 'ISO-8859-1'); // fallback to Latin-1
            // 100 KB should be enough for charset check
            if (strlen($string) > 100 * 1024) {
                break;
            }
        }
        return rcube_charset::detect($string, $fallback);
        return rcube_charset::check($string) ?: RCUBE_CHARSET;
    }
 }
--- a/tests/src/Csv2vcard/gmail.csv
+++ b/tests/src/Csv2vcard/gmail.csv
--- a/tests/src/utf-16_sample.vcf
+++ b/tests/src/utf-16_sample.vcf