Browse Source

Remove use of unreliable charset detection (#8344)

pull/8150/merge
Aleksander Machniak 3 years ago
parent
commit
f23aaed10e
  1. 1
      CHANGELOG.md
  2. 41
      program/lib/Roundcube/rcube_charset.php
  3. 12
      program/lib/Roundcube/rcube_csv2vcard.php
  4. 2
      program/lib/Roundcube/rcube_imap.php
  5. 18
      program/lib/Roundcube/rcube_message.php
  6. 27
      program/lib/Roundcube/rcube_vcard.php
  7. BIN
      tests/src/Csv2vcard/gmail.csv
  8. 0
      tests/src/utf-16_sample.vcf

1
CHANGELOG.md

@ -4,6 +4,7 @@
- Update to jQuery-UI 1.13.1 (#8455)
- Use navigator.pdfViewerEnabled for PDF viewer detection
- Remove use of unreliable charset detection (#8344)
- Password: Add support for ssha256 algorithm (#8459)
- Fix slow loading of long HTML content into the HTML editor (#8108)
- Fix bug where SMTP password didn't work if it contained '%p' (#8435)

41
program/lib/Roundcube/rcube_charset.php

@ -348,6 +348,46 @@ class rcube_charset
return $str;
}
/**
 * Check if the specified input string matches one of the provided charsets.
 *
 * Besides the charsets passed by the caller the candidate list includes
 * UTF-32 and UTF-16 (when suggested by a BOM or by the layout of NUL bytes),
 * RCUBE_CHARSET and the configured default_charset.
 *
 * @param string $str      Input string
 * @param array  $charsets Suspected charsets of the input string
 *
 * @return string|null First matching charset, NULL if none of them matches
 */
public static function check($str, $charsets = [])
{
    // Only the first 100 KB of the input is examined
    // NOTE(review): the cut is byte-based, so it may fall in the middle of
    // a multi-byte character - confirm convert()/clean() tolerate that
    $limit = 100 * 1024;
    $chunk = strlen($str) > $limit ? substr($str, 0, $limit) : $str;

    // Add default charset, system charset and easily detectable charsets to the list.
    // Caller-provided charsets stay in front, so they are tried first.
    $head4 = substr($chunk, 0, 4);
    $head2 = substr($chunk, 0, 2);

    // Byte Order Mark
    if ($head4 === "\0\0\xFE\xFF") {
        $charsets[] = 'UTF-32BE';
    }
    if ($head4 === "\xFF\xFE\0\0") {
        $charsets[] = 'UTF-32LE';
    }
    if ($head2 === "\xFE\xFF") {
        $charsets[] = 'UTF-16BE';
    }
    if ($head2 === "\xFF\xFE") {
        $charsets[] = 'UTF-16LE';
    }

    // Heuristics: position of NUL bytes relative to non-NUL bytes
    if (preg_match('/\x00\x00\x00[^\x00]/', $chunk)) {
        $charsets[] = 'UTF-32BE';
    }
    if (preg_match('/[^\x00]\x00\x00\x00/', $chunk)) {
        $charsets[] = 'UTF-32LE';
    }
    if (preg_match('/\x00[^\x00]\x00[^\x00]/', $chunk)) {
        $charsets[] = 'UTF-16BE';
    }
    if (preg_match('/[^\x00]\x00[^\x00]\x00/', $chunk)) {
        $charsets[] = 'UTF-16LE';
    }

    $charsets[] = RCUBE_CHARSET;
    $charsets[] = (string) rcube::get_instance()->config->get('default_charset');

    // Normalize the names, then drop empty entries and duplicates
    $charsets = array_map(['rcube_charset', 'parse_charset'], $charsets);
    $charsets = array_unique(array_filter($charsets));

    // The first candidate for which the conversion result is left
    // untouched by clean() is considered a match
    foreach ($charsets as $charset) {
        $ret = self::convert($chunk, $charset);

        if ($ret === self::clean($ret)) {
            return $charset;
        }
    }

    return null;
}
/**
* Converts string from standard UTF-7 (RFC 2152) to UTF-8.
*
@ -415,6 +455,7 @@ class rcube_charset
* @param string $language User language
*
* @return string Charset name
* @deprecated
*/
public static function detect($string, $failover = null, $language = null)
{

12
program/lib/Roundcube/rcube_csv2vcard.php

@ -420,12 +420,12 @@ class rcube_csv2vcard
*/
public function import($csv, $dry_run = false, $skip_head = true)
{
// convert to UTF-8
$head = substr($csv, 0, 4096);
$charset = rcube_charset::detect($head, RCUBE_CHARSET);
$csv = rcube_charset::convert($csv, $charset);
$csv = preg_replace(['/^[\xFE\xFF]{2}/', '/^\xEF\xBB\xBF/', '/^\x00+/'], '', $csv); // also remove BOM
$head = '';
// convert to UTF-8 (supports default_charset and RCUBE_CHARSET as input)
// TODO: If the input charset is invalid we should probably just abort here
if ($charset = rcube_charset::check($csv)) {
$csv = rcube_charset::convert($csv, $charset);
}
$csv = preg_replace(['/^[\xFE\xFF]{2}/', '/^\xEF\xBB\xBF/', '/^\x00+/'], '', $csv); // also remove BOM
// Split CSV file into lines
$lines = rcube_utils::explode_quoted_string('[\r\n]+', $csv);

2
program/lib/Roundcube/rcube_imap.php

@ -2362,7 +2362,7 @@ class rcube_imap extends rcube_storage
$charset = $this->struct_charset;
}
else {
$charset = rcube_charset::detect($filename_mime, $this->default_charset);
$charset = $this->default_charset;
}
$part->filename = rcube_mime::decode_mime_string($filename_mime, $charset);

18
program/lib/Roundcube/rcube_message.php

@ -1223,21 +1223,9 @@ class rcube_message
$charsets[] = $this->headers->charset;
}
if (empty($charsets)) {
$rcube = rcube::get_instance();
$charsets[] = rcube_charset::detect($name, $rcube->config->get('default_charset', RCUBE_CHARSET));
}
foreach (array_unique($charsets) as $charset) {
$_name = rcube_charset::convert($name, $charset);
if ($_name == rcube_charset::clean($_name)) {
if (!$part->charset) {
$part->charset = $charset;
}
return $_name;
}
if ($charset = rcube_charset::check($name, $charsets)) {
$name = rcube_charset::convert($name, $charset);
$part->charset = $charset;
}
return $name;

27
program/lib/Roundcube/rcube_vcard.php

@ -994,21 +994,6 @@ class rcube_vcard
*/
private static function detect_encoding($string)
{
// Detect common encodings
if (substr($string, 0, 4) == "\0\0\xFE\xFF") return 'UTF-32BE'; // Big Endian
if (substr($string, 0, 4) == "\xFF\xFE\0\0") return 'UTF-32LE'; // Little Endian
if (substr($string, 0, 2) == "\xFE\xFF") return 'UTF-16BE'; // Big Endian
if (substr($string, 0, 2) == "\xFF\xFE") return 'UTF-16LE'; // Little Endian
if (substr($string, 0, 3) == "\xEF\xBB\xBF") return 'UTF-8';
// heuristics
if (strlen($string) >= 4) {
if ($string[0] == "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-32BE';
if ($string[0] != "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] == "\0") return 'UTF-32LE';
if ($string[0] == "\0" && $string[1] != "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-16BE';
if ($string[0] != "\0" && $string[1] == "\0" && $string[2] != "\0" && $string[3] == "\0") return 'UTF-16LE';
}
// Extract the plain text from the vCard, so the detection is more accurate
// This will for example exclude photos
@ -1024,6 +1009,9 @@ class rcube_vcard
$prefix = substr($lines[$i], 0, $pos);
// We remove \0 so that it works with UTF-16/UTF-32 encodings
$prefix = str_replace("\0", '', $prefix);
// Take only properties that are known to contain human-readable text
if (!preg_match('/^(item\d+\.)?(N|FN|ORG|ADR|NOTE|TITLE|CATEGORIES)(;|$)/', $prefix)) {
continue;
@ -1060,10 +1048,13 @@ class rcube_vcard
}
$string .= $data . ' ';
}
$fallback = rcube::get_instance()->config->get('default_charset', 'ISO-8859-1'); // fallback to Latin-1
// 100 KB should be enough for charset check
if (strlen($string) > 100 * 1024) {
break;
}
}
return rcube_charset::detect($string, $fallback);
return rcube_charset::check($string) ?: RCUBE_CHARSET;
}
}

BIN
tests/src/Csv2vcard/gmail.csv

0
tests/src/utf-16_sample.vcf

Loading…
Cancel
Save