RoundCube Webmail
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

971 lines
39 KiB

4 months ago
  1. <?php
  2. use Dom\HTMLDocument;
  3. use Masterminds\HTML5;
  4. /*
  5. +-----------------------------------------------------------------------+
  6. | This file is part of the Roundcube Webmail client |
  7. | |
  8. | Copyright (C) The Roundcube Dev Team |
  9. | |
  10. | Licensed under the GNU General Public License version 3 or |
  11. | any later version with exceptions for skins & plugins. |
  12. | See the README file for a full license statement. |
  13. | |
  14. | PURPOSE: |
  15. | Utility class providing HTML sanitizer (based on Washtml class) |
  16. +-----------------------------------------------------------------------+
  17. | Author: Thomas Bruederli <roundcube@gmail.com> |
  18. | Author: Aleksander Machniak <alec@alec.pl> |
  19. | Author: Frederic Motte <fmotte@ubixis.com> |
  20. +-----------------------------------------------------------------------+
  21. Washtml, a HTML sanitizer.
  22. Copyright (c) 2007 Frederic Motte <fmotte@ubixis.com>
  23. All rights reserved.
  24. Redistribution and use in source and binary forms, with or without
  25. modification, are permitted provided that the following conditions
  26. are met:
  27. 1. Redistributions of source code must retain the above copyright
  28. notice, this list of conditions and the following disclaimer.
  29. 2. Redistributions in binary form must reproduce the above copyright
  30. notice, this list of conditions and the following disclaimer in the
  31. documentation and/or other materials provided with the distribution.
  32. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  33. IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  34. OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  35. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  36. INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  37. NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  38. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  39. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  40. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  41. THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  42. */
  43. /**
  44. * Utility class providing HTML sanitizer
  45. */
  46. class rcube_washtml
  47. {
  /**
   * Default whitelist of element names (lowercase) kept by the washer.
   *
   * The constructor merges this list with any names passed in the
   * 'html_elements' option and flips it into a lookup table.
   *
   * @var array Allowed HTML elements (default)
   */
  public static $html_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
      'basefont', 'bdo', 'big', 'blockquote', 'br', 'caption', 'center',
      'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
      'dt', 'em', 'fieldset', 'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
      'ins', 'label', 'legend', 'li', 'map', 'menu', 'nobr', 'ol', 'p', 'pre', 'q',
      's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
      'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'wbr', 'img',
      'video', 'source',
      // form elements
      'button', 'input', 'textarea', 'select', 'option', 'optgroup',
      // SVG
      'svg', 'altglyph', 'altglyphdef', 'altglyphitem', 'animate',
      'animatecolor', 'animatetransform', 'circle', 'clippath', 'defs', 'desc',
      'ellipse', 'font', 'g', 'glyph', 'glyphref', 'hkern', 'image', 'line',
      'lineargradient', 'marker', 'mask', 'mpath', 'path', 'pattern',
      'polygon', 'polyline', 'radialgradient', 'rect', 'set', 'stop', 'switch', 'symbol',
      'text', 'textpath', 'tref', 'tspan', 'use', 'view', 'vkern', 'filter',
      // SVG Filters
      'feblend', 'fecolormatrix', 'fecomponenttransfer', 'fecomposite',
      'feconvolvematrix', 'fediffuselighting', 'fedisplacementmap',
      'feflood', 'fefunca', 'fefuncb', 'fefuncg', 'fefuncr', 'fegaussianblur',
      'feimage', 'femerge', 'femergenode', 'femorphology', 'feoffset',
      'fespecularlighting', 'fetile', 'feturbulence',
      // MathML
      // ("mmultiscripts" was previously misspelled, so the real element was never allowed)
      'math', 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mlabeledtr',
      'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom', 'mroot', 'mrow',
      'ms', 'mspace', 'msqrt', 'mstyle', 'msub', 'msup', 'msubsup', 'mtable', 'mtd',
      'mtext', 'mtr', 'munder', 'munderover', 'maligngroup', 'malignmark',
      'mprescripts', 'semantics', 'annotation', 'annotation-xml', 'none',
      'infinity', 'matrix', 'matrixrow', 'ci', 'cn', 'sep', 'apply',
      'plus', 'minus', 'eq', 'power', 'times', 'divide', 'csymbol', 'root',
      'bvar', 'lowlimit', 'uplimit',
  ];
  84. /**
  85. * @var array Ignore these HTML tags and their content
  86. */
  87. public static $ignore_elements = ['script', 'applet', 'embed', 'style'];
  88. /**
  89. * @var array Allowed HTML attributes
  90. */
  91. public static $html_attribs = ['name', 'class', 'title', 'alt', 'width', 'height',
  92. 'align', 'nowrap', 'col', 'row', 'id', 'rowspan', 'colspan', 'cellspacing',
  93. 'cellpadding', 'valign', 'bgcolor', 'color', 'border', 'bordercolorlight',
  94. 'bordercolordark', 'face', 'marginwidth', 'marginheight', 'axis', 'border',
  95. 'abbr', 'char', 'charoff', 'clear', 'compact', 'coords', 'vspace', 'hspace',
  96. 'cellborder', 'size', 'lang', 'dir', 'usemap', 'shape', 'media',
  97. 'background', 'src', 'poster', 'href', 'headers', 'start', 'reversed',
  98. // attributes of form elements
  99. 'type', 'rows', 'cols', 'disabled', 'readonly', 'checked', 'multiple', 'value', 'for',
  100. // SVG
  101. 'accent-height', 'accumulate', 'additive', 'alignment-baseline', 'alphabetic',
  102. 'ascent', 'attributename', 'attributetype', 'azimuth', 'basefrequency', 'baseprofile',
  103. 'baseline-shift', 'begin', 'bias', 'by', 'clip', 'clip-path', 'clip-rule',
  104. 'color', 'color-interpolation', 'color-interpolation-filters', 'color-profile',
  105. 'color-rendering', 'cx', 'cy', 'd', 'dx', 'dy', 'diffuseconstant', 'direction',
  106. 'display', 'divisor', 'dur', 'edgemode', 'elevation', 'end', 'fill', 'fill-opacity',
  107. 'fill-rule', 'filter', 'flood-color', 'flood-opacity', 'font-family', 'font-size',
  108. 'font-size-adjust', 'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
  109. 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'glyphref', 'gradientunits', 'gradienttransform',
  110. 'image-rendering', 'in', 'in2', 'k', 'k1', 'k2', 'k3', 'k4', 'kerning', 'keypoints',
  111. 'keysplines', 'keytimes', 'lengthadjust', 'letter-spacing', 'kernelmatrix',
  112. 'kernelunitlength', 'lighting-color', 'local', 'marker-end', 'marker-mid',
  113. 'marker-start', 'markerheight', 'markerunits', 'markerwidth', 'maskcontentunits',
  114. 'maskunits', 'max', 'mask', 'mode', 'min', 'numoctaves', 'offset', 'operator',
  115. 'opacity', 'order', 'orient', 'orientation', 'origin', 'overflow', 'paint-order',
  116. 'path', 'pathlength', 'patterncontentunits', 'patterntransform', 'patternunits',
  117. 'points', 'preservealpha', 'r', 'rx', 'ry', 'radius', 'refx', 'refy', 'repeatcount',
  118. 'repeatdur', 'restart', 'rotate', 'scale', 'seed', 'shape-rendering', 'show', 'specularconstant',
  119. 'specularexponent', 'spreadmethod', 'stddeviation', 'stitchtiles', 'stop-color',
  120. 'stop-opacity', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
  121. 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity', 'stroke', 'stroke-width',
  122. 'surfacescale', 'targetx', 'targety', 'transform', 'text-anchor', 'text-decoration',
  123. 'text-rendering', 'textlength', 'to', 'u1', 'u2', 'unicode', 'values', 'viewbox',
  124. 'visibility', 'vert-adv-y', 'version', 'vert-origin-x', 'vert-origin-y', 'word-spacing',
  125. 'wrap', 'writing-mode', 'xchannelselector', 'ychannelselector', 'x', 'x1', 'x2',
  126. 'xmlns', 'y', 'y1', 'y2', 'z', 'zoomandpan',
  127. // MathML
  128. 'accent', 'accentunder', 'bevelled', 'close', 'columnalign', 'columnlines',
  129. 'columnspan', 'denomalign', 'depth', 'display', 'displaystyle', 'encoding', 'fence',
  130. 'frame', 'largeop', 'length', 'linethickness', 'lspace', 'lquote',
  131. 'mathbackground', 'mathcolor', 'mathsize', 'mathvariant', 'maxsize',
  132. 'minsize', 'movablelimits', 'notation', 'numalign', 'open', 'rowalign',
  133. 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'rquote', 'scriptlevel',
  134. 'scriptminsize', 'scriptsizemultiplier', 'selection', 'separator',
  135. 'separators', 'stretchy', 'subscriptshift', 'supscriptshift', 'symmetric', 'voffset',
  136. 'fontsize', 'fontweight', 'fontstyle', 'fontfamily', 'groupalign', 'edge', 'side',
  137. ];
  138. /**
  139. * @var array Elements which could be empty and be returned in short form (<tag />)
  140. */
  141. public static $void_elements = ['area', 'base', 'br', 'col', 'command', 'embed', 'hr',
  142. 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
  143. // MathML
  144. 'sep', 'infinity', 'in', 'plus', 'eq', 'power', 'times', 'divide', 'root',
  145. 'maligngroup', 'none', 'mprescripts',
  146. ];
  147. /**
  148. * @var array Additional allowed attributes of body element
  149. */
  150. public static $body_attribs = ['alink', 'background', 'bgcolor', 'link', 'text', 'vlink'];
  151. /** @var bool State indicating existence of linked objects in HTML */
  152. public $extlinks = false;
  153. /** @var array Current settings */
  154. private $config = [
  155. 'add_comments' => true,
  156. 'allow_remote' => false,
  157. 'base_url' => '',
  158. 'charset' => RCUBE_CHARSET,
  159. 'cid_map' => [],
  160. 'show_washed' => true,
  161. ];
  162. /** @var array Registered callback functions for tags */
  163. private $handlers = [];
  164. /** @var array Allowed HTML elements */
  165. private $_html_elements = [];
  166. /** @var array Ignore these HTML tags and their content (content is kept only if the tag is also an allowed element) */
  167. private $_ignore_elements = [];
  168. /** @var array Elements which could be empty and be returned in short form (<tag />) */
  169. private $_void_elements = [];
  170. /** @var array Allowed HTML attributes */
  171. private $_html_attribs = [];
  172. /** @var string A prefix to be added to id/class/for attribute values */
  173. private $_css_prefix;
  174. /** @var int Max nesting level */
  175. private $max_nesting_level;
  176. /** @var bool Indicates that a nesting level error was logged */
  177. private $max_nesting_level_error = false;
  178. /** @var bool True if current document is XML */
  179. private $is_xml = false;
  /**
   * Class constructor
   *
   * Builds the lookup tables of allowed/ignored/void elements and allowed
   * attributes (custom entries merged with the class defaults) and stores
   * every remaining option in the configuration array.
   *
   * @param array $p Configuration options:
   *                 allow_remote: is a boolean to allow link to remote resources (images/css)
   *                 blocked_src: string with image-src to be used for blocked remote images
   *                 show_washed: is a boolean to include washed out attributes as x-washed
   *                 cid_map: is an array where cid urls index urls to replace them
   *                 charset: is a string containing the charset of the HTML document,
   *                 to be used if the charset is not defined in the document
   *                 css_prefix: A prefix to be added to id/class/for attribute values
   *                 html_elements: Additional allowed HTML elements
   *                 ignore_elements: Additional HTML elements to ignore
   *                 html_attribs: Additional allowed HTML attributes
   *                 void_elements: Elements which could be empty and be returned in short form (<tag />)
   */
  public function __construct($p = [])
  {
      // Each list option maps to a private "_<option>" lookup table keyed by name
      foreach (['html_elements', 'html_attribs', 'ignore_elements', 'void_elements'] as $option) {
          $custom = isset($p[$option]) ? (array) $p[$option] : [];

          $this->{'_' . $option} = array_flip($custom) + array_flip(self::$$option);

          unset($p[$option]);
      }

      // An empty prefix is treated the same as no prefix at all
      $has_prefix = isset($p['css_prefix']) && strlen($p['css_prefix']);
      $this->_css_prefix = $has_prefix ? $p['css_prefix'] : null;
      unset($p['css_prefix']);

      // Whatever is left overrides the default settings
      $this->config = array_merge($this->config, $p);
  }
  /**
   * Register a handler for a tag.
   *
   * While dumping the document, a registered handler is called instead of
   * the default output for that tag. A later registration for the same tag
   * replaces the earlier one.
   *
   * @param string   $tag      HTML tag name
   * @param callable $callback Callback function
   */
  public function add_callback($tag, $callback)
  {
      $this->handlers[$tag] = $callback;
  }
  /**
   * Check CSS style
   *
   * Decodes the style value, collapses line breaks and tabs into spaces and
   * passes it to rcube_utils::sanitize_css_block() together with a callback
   * that washes every URI found in the block. Trailing semicolons are
   * stripped from the result.
   *
   * @param string $style CSS style
   *
   * @return string Washed CSS style
   */
  private function wash_style($style)
  {
      // Decode insecure character sequences
      $style = rcube_utils::xss_entity_decode($style);

      // Remove unwanted white-space characters
      $style = preg_replace('/[\n\r\t]+/', ' ', $style);

      // URIs are washed first and then escaped; a rejected URI yields null
      $uri_callback = function ($uri) {
          $uri = $this->wash_uri($uri);

          if ($uri) {
              return htmlspecialchars($uri, \ENT_QUOTES, $this->config['charset']);
          }

          return null;
      };

      return rtrim(rcube_utils::sanitize_css_block($style, $uri_callback), ';');
  }
  /**
   * Take a node and return allowed attributes and check values
   *
   * Allowed attributes are re-emitted with escaped values; URI-like values
   * are washed first. Names of removed attributes are collected and, when
   * the 'show_washed' option is on, listed in an x-washed attribute.
   *
   * @param DOMElement $node Document element
   *
   * @return string Washed element attributes
   */
  private function wash_attribs($node)
  {
      $html = '';
      $removed = [];
      $charset = $this->config['charset'];
      $extra_attribs = $node->nodeName == 'body' ? self::$body_attribs : [];

      foreach ($node->attributes as $name => $attr) {
          $key = strtolower($name);
          $value = $attr->nodeValue;

          if ($key == 'style' && ($style = $this->wash_style($value))) {
              // replace double quotes to prevent syntax error and XSS issues (#1490227)
              $html .= ' style="' . str_replace('"', '&quot;', $style) . '"';
              continue;
          }

          // Attribute is not on any whitelist
          if (!isset($this->_html_attribs[$key]) && !in_array($key, $extra_attribs)) {
              $removed[] = htmlspecialchars($attr->nodeName, \ENT_QUOTES, $charset);
              continue;
          }

          $value = trim($value);
          $out = null;

          // in SVG to/from attribs may contain anything, including URIs,
          // so they are checked as if they were the animated attribute
          if ($key == 'to' || $key == 'from') {
              $key = strtolower((string) $node->getAttribute('attributeName'));
              if ($key && !isset($this->_html_attribs[$key])) {
                  $key = null;
              }
          }

          if ($this->is_image_attribute($node->nodeName, $key)) {
              $out = $this->wash_uri($value, true);
          } elseif ($this->is_link_attribute($node->nodeName, $key)) {
              $out = $this->wash_link($value);
          } elseif ($this->is_funciri_attribute($node->nodeName, $key)) {
              $out = $value;

              if (preg_match('/^[a-z:]*url\(/i', $value)
                  && preg_match('/^([a-z:]*url)\(\s*[\'"]?([^\'"\)]*)[\'"]?\s*\)/iu', $value, $match)
              ) {
                  // A well-formed url(...) is only kept when its URI survives washing
                  $out = null;

                  if ($url = $this->wash_uri($match[2])) {
                      $rest = substr($value, strlen($match[0]));
                      $html .= ' ' . $attr->nodeName . '="' . $match[1]
                          . '(' . htmlspecialchars($url, \ENT_QUOTES, $charset) . ')'
                          . htmlspecialchars($rest, \ENT_QUOTES, $charset) . '"';
                      continue;
                  }
              }
          } elseif ($this->_css_prefix !== null
              && (in_array($key, ['id', 'class', 'for']) || ($key == 'name' && $node->nodeName == 'a'))
          ) {
              // Prefix every whitespace-separated token
              $out = preg_replace('/(\S+)/', $this->_css_prefix . '\1', $value);
          } elseif ($key == 'xmlns' && !strpos($value, '://')) {
              continue;
          } elseif ($key) {
              $out = $value;
          }

          if ($out !== null && $out !== '') {
              $escaped = htmlspecialchars($out, \ENT_QUOTES | \ENT_SUBSTITUTE, $charset);
              $html .= " {$attr->nodeName}=\"{$escaped}\"";
          } elseif ($value) {
              // The attribute had a value but nothing of it was accepted
              $removed[] = htmlspecialchars($attr->nodeName, \ENT_QUOTES, $charset);
          }
      }

      if (!empty($removed) && $this->config['show_washed']) {
          $html .= ' x-washed="' . implode(' ', $removed) . '"';
      }

      return $html;
  }
  /**
   * Wash URI value
   *
   * Accepted are: entries found in the cid map (directly or with the base
   * URL prepended), "#fragment" references, remote http/https/ftp URIs when
   * remote resources are allowed, and data:image URIs (for images only).
   * Anything else yields an empty string.
   *
   * @param string $uri            URI
   * @param bool   $blocked_source Block remote source (not read by this method)
   * @param bool   $is_image       URI points to an image
   *
   * @return string Washed URI
   */
  private function wash_uri($uri, $blocked_source = false, $is_image = true)
  {
      // Known inline attachment, addressed directly...
      if (!empty($this->config['cid_map'][$uri])) {
          return $this->config['cid_map'][$uri];
      }

      // ...or relative to the document base URL
      $based = $this->config['base_url'] . $uri;
      if (!empty($this->config['cid_map'][$based])) {
          return $this->config['cid_map'][$based];
      }

      // allow url(#id) used in SVG
      if (isset($uri[0]) && $uri[0] == '#') {
          return $this->_css_prefix !== null
              ? '#' . $this->_css_prefix . substr($uri, 1)
              : $uri;
      }

      if (preg_match('/^(http|https|ftp):.+/i', $uri)) {
          if (!empty($this->config['allow_remote'])) {
              return $uri;
          }

          // Remember that the document references blocked remote content
          $this->extlinks = true;

          if ($is_image && !empty($this->config['blocked_src'])) {
              return $this->config['blocked_src'];
          }

          return '';
      }

      // RFC2397
      if (!$is_image || !preg_match('/^data:image\/([^,]+),(.+)$/is', $uri, $matches)) {
          return '';
      }

      $type = preg_replace('/\s/', '', $matches[1]);

      if (stripos($type, 'svg') === false) {
          return $uri;
      }

      // svg images can be insecure, we'll sanitize them
      $is_base64 = stripos($type, ';base64') !== false;
      $svg = $is_base64 ? base64_decode($matches[2]) : $matches[2];

      if (!$is_base64) {
          // The washed result is always returned base64-encoded
          $type .= ';base64';
      }

      $washer = new self($this->config);
      $svg = $washer->wash($svg);

      // Invalid svg content
      if (empty($svg)) {
          return '';
      }

      return 'data:image/' . $type . ',' . base64_encode($svg);
  }
  /**
   * Wash Href value
   *
   * Rejects empty values and values starting with javascript, vbscript or
   * data:. Fragment links get the CSS prefix (if one is set), a bare
   * host-like word gets "http://" prepended, and values that start with a
   * scheme, "//" or "#" are returned as they are. Everything else is dropped.
   *
   * @param string $href Href attribute value (link)
   *
   * @return string Washed href
   */
  private function wash_link($href)
  {
      if (!strlen($href) || preg_match('!^(javascript|vbscript|data:)!i', $href)) {
          return '';
      }

      $is_fragment = $href[0] == '#';

      if ($is_fragment && $this->_css_prefix !== null) {
          return '#' . $this->_css_prefix . substr($href, 1);
      }

      // Letters, dots, underscores and dashes only: assume a host name
      if (preg_match('!^[a-zA-Z._-]+$!', $href)) {
          return 'http://' . $href;
      }

      if (preg_match('!^([a-z][a-z0-9.+-]+:|//|#).+!i', $href)) {
          return $href;
      }

      return '';
  }
  /**
   * Check if the tag/attribute may contain a URI
   *
   * Only the attribute name is considered; the tag name is not used.
   *
   * @param string $tag  Element name
   * @param string $attr Attribute name
   *
   * @return bool True if attribute may contain a URI, False otherwise
   */
  private function is_link_attribute($tag, $attr)
  {
      return 'href' === $attr;
  }
  /**
   * Check if the tag/attribute may contain an image URI
   *
   * @param string $tag  Element name
   * @param string $attr Attribute name
   *
   * @return bool True if attribute may contain an image URI, False otherwise
   */
  private function is_image_attribute($tag, $attr)
  {
      // src on elements that load media
      $is_media_src = $attr == 'src'
          && preg_match('/^(img|image|source|input|video|audio)$/i', $tag);

      // href on SVG elements that pull in another resource
      $is_svg_href = $attr == 'href' && ($tag == 'use' || $tag == 'image');

      return $attr == 'background'
          || $attr == 'color-profile' // SVG
          || ($attr == 'poster' && $tag == 'video')
          || $is_media_src
          || $is_svg_href;
  }
  /**
   * Check if the tag/attribute may contain a FUNCIRI value
   *
   * Only the attribute name is considered; the tag name is not used.
   *
   * @param string $tag  Element name
   * @param string $attr Attribute name
   *
   * @return bool True if attribute may contain a FUNCIRI value, False otherwise
   */
  private function is_funciri_attribute($tag, $attr)
  {
      $funciri_attribs = [
          'fill', 'filter', 'stroke',
          'marker-start', 'marker-end', 'marker-mid',
          'clip-path', 'mask', 'cursor',
      ];

      return in_array($attr, $funciri_attribs);
  }
  /**
   * Check if a specified element has an attribute with specified value.
   *
   * Both the attribute name and the value are compared case-insensitively;
   * the attribute's value is trimmed before the comparison.
   *
   * @param DOMElement $node       The element
   * @param string     $attr_name  The attribute name
   * @param string     $attr_value The attribute value to find
   *
   * @return bool True if the specified attribute exists and has the expected value
   */
  private static function attribute_value($node, $attr_name, $attr_value)
  {
      $wanted_name = strtolower($attr_name);
      $wanted_value = strtolower($attr_value);

      foreach ($node->attributes as $name => $attr) {
          if (strtolower($name) !== $wanted_name) {
              continue;
          }

          if (strtolower(trim($attr->nodeValue)) === $wanted_value) {
              return true;
          }
      }

      return false;
  }
  /**
   * The main loop that recurses on a node tree.
   * It outputs only allowed tags with allowed attributes and allowed inline styles.
   *
   * Text and CDATA nodes are escaped, element nodes are filtered against the
   * configured element lists (or passed to a registered tag handler), and
   * all other node types produce no output.
   *
   * @param DOMNode $node  HTML element
   * @param int     $level Recurrence level (safe initial value found empirically)
   *
   * @return string HTML content
   */
  private function dumpHtml($node, $level = 20)
  {
      if (!$node->hasChildNodes()) {
          return '';
      }

      $level++;

      // Stop descending shortly before the configured recursion limit is hit
      if ($this->max_nesting_level > 0 && $level == $this->max_nesting_level - 1) {
          // log error message once
          if (empty($this->max_nesting_level_error)) {
              $this->max_nesting_level_error = true;
              rcube::raise_error([
                  'code' => 500,
                  'message' => "Maximum nesting level exceeded (xdebug.max_nesting_level={$this->max_nesting_level})",
              ], true, false);
          }

          return $this->config['add_comments'] ? '<!-- ignored -->' : '';
      }

      $output = '';

      for ($child = $node->firstChild; $child; $child = $child->nextSibling) {
          $type = $child->nodeType;

          if ($type == \XML_CDATA_SECTION_NODE || $type == \XML_TEXT_NODE) {
              $output .= htmlspecialchars($child->nodeValue, \ENT_COMPAT | \ENT_HTML401 | \ENT_SUBSTITUTE, $this->config['charset']);
              continue;
          }

          if ($type == \XML_HTML_DOCUMENT_NODE) {
              $output .= $this->dumpHtml($child, $level);
              continue;
          }

          if ($type != \XML_ELEMENT_NODE) {
              continue;
          }

          /** @var DOMElement $child */
          $tag_name = strtolower($child->nodeName);

          if ($tag_name == 'link') {
              $uri = $this->wash_uri($child->getAttribute('href'), false, false);

              if (!$uri) {
                  if ($this->config['add_comments']) {
                      $output .= '<!-- link ignored -->';
                  }
                  continue;
              }

              $child->setAttribute('href', (string) $uri);
          } elseif (in_array($tag_name, ['animate', 'animatecolor', 'set', 'animatetransform'])
              && self::attribute_value($child, 'attributename', 'href')
          ) {
              // Insecure svg tags
              if ($this->config['add_comments']) {
                  $output .= "<!-- {$tag_name} blocked -->";
              }
              continue;
          }

          if (!empty($this->handlers[$tag_name])) {
              // A registered handler renders the element itself
              $handler = $this->handlers[$tag_name];
              $attribs = $this->wash_attribs($child);
              $inner = $this->dumpHtml($child, $level);

              $output .= call_user_func($handler, $tag_name, $attribs, $inner, $this);
              continue;
          }

          $is_ignored = isset($this->_ignore_elements[$tag_name]);

          if (!isset($this->_html_elements[$tag_name])) {
              $escaped_name = htmlspecialchars($tag_name, \ENT_QUOTES, $this->config['charset']);

              if ($is_ignored) {
                  // Ignored element: neither the tag nor its content is dumped
                  if ($this->config['add_comments']) {
                      $output .= '<!-- ' . $escaped_name . ' not allowed -->';
                  }
              } else {
                  if ($this->config['add_comments']) {
                      $output .= '<!-- ' . $escaped_name . ' ignored -->';
                  }
                  $output .= $this->dumpHtml($child, $level); // ignore tags not its content
              }
              continue;
          }

          // Allowed element
          $content = $this->dumpHtml($child, $level);
          $open_tag = '<' . $tag_name;

          if ($tag_name == 'svg') {
              // Re-declare the namespaces that are in scope on the svg element
              if (method_exists($child, 'getInScopeNamespaces')) {
                  $ns_nodes = $child->getInScopeNamespaces();
              } else {
                  $xpath = new DOMXPath($child->ownerDocument);
                  $ns_nodes = $xpath->query('namespace::*');
              }

              foreach ($ns_nodes as $ns) {
                  if (isset($ns->nodeName) && isset($ns->nodeValue)
                      && $ns->nodeName != 'xmlns:xml'
                      && preg_match('/^[a-zA-Z:-]+$/', $ns->nodeName)
                      && strpos($ns->nodeValue, '://')
                  ) {
                      $open_tag .= sprintf(' %s="%s"',
                          $ns->nodeName,
                          htmlspecialchars($ns->nodeValue, \ENT_QUOTES, $this->config['charset'])
                      );
                  }
              }
          } elseif ($tag_name == 'textarea' && strpos($content, '<') !== false) {
              // Markup inside a textarea is output as text
              $content = htmlspecialchars($content, \ENT_QUOTES | \ENT_SUBSTITUTE, $this->config['charset']);
          }

          $open_tag .= $this->wash_attribs($child);

          if ($is_ignored) {
              $output .= $content;
          } elseif ($content === '' && ($this->is_xml || isset($this->_void_elements[$tag_name]))) {
              $output .= $open_tag . ' />';
          } else {
              $output .= $open_tag . '>' . $content . '</' . $tag_name . '>';
          }
      }

      return $output;
  }
  /**
   * Main function, give it untrusted HTML, tell it if you allow loading
   * remote images and give it a map to convert "cid:" urls.
   *
   * The input is cleaned up, parsed with the first parser that is available
   * (PHP's Dom\HTMLDocument, then Masterminds HTML5, then DOMDocument) and
   * the resulting tree is dumped through the whitelist filters. Content
   * with an <svg tag and no html/head/body tag is parsed as XML with DOMDocument.
   *
   * @param string $html HTML content
   *
   * @return string Washed HTML content
   */
  public function wash($html)
  {
      $this->extlinks = false;

      $html = $this->cleanup($html);

      // Find base URL for images
      $has_base = preg_match('/<base\s+href=[\'"]*([^\'"]+)/is', $html, $matches);
      $this->config['base_url'] = $has_base ? $matches[1] : '';

      // Detect max nesting level (for dumpHTML) (#1489110)
      $this->max_nesting_level = (int) @ini_get('xdebug.max_nesting_level');

      // SVG need to be parsed as XML
      $this->is_xml = !preg_match('/<(html|head|body)/i', $html) && stripos($html, '<svg') !== false;

      $document = null;

      if (!$this->is_xml) {
          // Try HTML5 parser available in PHP >= 8.4
          // TODO: Parse XML also with this new PHP parser (?)
          if (class_exists('Dom\HTMLDocument')) {
              try {
                  $options = constant('Dom\HTML_NO_DEFAULT_NS') | \LIBXML_COMPACT | \LIBXML_NOERROR;
                  $document = HTMLDocument::createFromString($html, $options, $this->config['charset']);
              } catch (Exception $e) {
                  // ignore, fallback to other methods
              }
          }

          // DOMDocument does not support HTML5, try Masterminds parser if available
          if (empty($document) && class_exists('Masterminds\HTML5')) {
              try {
                  // disabled_html_ns=true is a workaround for the performance issue
                  // https://github.com/Masterminds/html5-php/issues/181
                  $html5 = new HTML5(['disable_html_ns' => true]);
                  $document = $html5->loadHTML($this->fix_html5($html));
              } catch (Exception $e) {
                  // ignore, fallback to DOMDocument
              }
          }
      }

      if (empty($document)) {
          // Charset seems to be ignored (probably if defined in the HTML document)
          $document = new DOMDocument('1.0', $this->config['charset']);
          $loader = $this->is_xml ? 'loadXML' : 'loadHTML';

          @$document->{$loader}($html, \LIBXML_PARSEHUGE | \LIBXML_COMPACT | \LIBXML_NONET);
      }

      unset($html); // release some memory

      return $this->dumpHtml($document);
  }
  /**
   * Getter for config parameters
   *
   * The element/attribute lists and the CSS prefix live in dedicated
   * private properties; every other name is looked up in the config array.
   *
   * @param string $prop Configuration parameter name
   *
   * @return mixed Configuration parameter value (null for an unknown config key)
   */
  public function get_config($prop)
  {
      $own_properties = ['html_elements', 'html_attribs', 'ignore_elements', 'void_elements', 'css_prefix'];

      return in_array($prop, $own_properties)
          ? $this->{'_' . $prop}
          : ($this->config[$prop] ?? null);
  }
  /**
   * Clean HTML input
   *
   * Applies a series of textual fixes before the document is parsed:
   * regular-expression replacements, replacement of MS Word quote
   * characters, tag name repair, removal of invalid comments, repair of
   * broken nested lists and resolution of relative URLs.
   *
   * @param string $html HTML content
   *
   * @return string Clean HTML content
   */
  private function cleanup($html)
  {
      $html = trim($html);

      // special replacements (not properly handled by washtml class)
      $fixups = [
          // space(s) between <NOBR>
          '/(<\/nobr>)(\s+)(<nobr>)/i' => '\1 &nbsp; \3',
          // PHP bug #32547 workaround: remove title tag
          '/<title[^>]*>.*<\/title>/iU' => '',
          // remove <!doctype> before BOM (#1490291)
          '/<\!doctype[^>]+>[^<]*/im' => '',
          // byte-order mark (only outlook?)
          '/^(\0\0\xFE\xFF|\xFF\xFE\0\0|\xFE\xFF|\xFF\xFE|\xEF\xBB\xBF)/' => '',
          // washtml/DOMDocument cannot handle xml namespaces
          '/<html\s[^>]+>/i' => '<html>',
          // washtml/DOMDocument cannot handle xml namespaces
          // HTML5 parser cannot handle the xml declaration
          '/<\?xml[^>]*>/i' => '',
      ];

      $html = preg_replace(array_keys($fixups), array_values($fixups), $html);

      $err = ['message' => 'Could not clean up HTML!'];
      if ($html === null && rcube_utils::preg_error($err)) {
          return '';
      }

      // Replace all of those weird MS Word quotes and other high characters
      $word_chars = [
          "\xe2\x80\x98" => "'",       // left single quote
          "\xe2\x80\x99" => "'",       // right single quote
          "\xe2\x80\x9c" => '"',       // left double quote
          "\xe2\x80\x9d" => '"',       // right double quote
          "\xe2\x80\x94" => '&mdash;', // em dash
          "\xe2\x80\xa6" => '...',     // ellipses
      ];

      $html = str_replace(array_keys($word_chars), array_values($word_chars), $html);

      // FIXME: HTML comments handling could be better. The code below can break comments (#6464),
      //        we should probably do not modify content inside comments at all.

      // fix (unknown/malformed) HTML tags before "wash"
      $html = preg_replace_callback('/(<(?!\!)[\/]*)([^\s>]+)([^>]*)/', [$this, 'html_tag_callback'], $html);

      // Remove invalid HTML comments (#1487759)
      // Note: We don't want to remove valid comments, conditional comments
      // and MSOutlook comments (<!-->)
      $html = preg_replace('/<!--[a-zA-Z0-9]+>/', '', $html);

      // fix broken nested lists (modifies $html in place)
      self::fix_broken_lists($html);

      // turn relative into absolute urls
      return self::resolve_base($html);
  }
  /**
   * Callback function for HTML tags fixing
   *
   * Expects the three groups captured by the tag pattern used in cleanup():
   * the opening "<" or "</", the tag name and the rest of the tag.
   *
   * @param array $matches Matched elements (from preg_replace_callback())
   *
   * @return string Replacement string
   */
  public static function html_tag_callback($matches)
  {
      // It might be an ending of a comment, ignore (#6464)
      if (substr($matches[3], -2) == '--') {
          // Re-join the captured groups without the full match
          return implode('', array_replace($matches, [0 => '']));
      }

      $cleanups = [
          '/:.*$/', // Microsoft's Smart Tags <st1:xxxx>
          '/[^a-z0-9_\[\]\!?-]/i', // forbidden characters
      ];
      $name = preg_replace($cleanups, '', $matches[2]);

      // fix invalid closing tags - remove any attributes (#1489446)
      $rest = $matches[1] == '</' ? '' : $matches[3];

      return $matches[1] . $name . $rest;
  }
  /**
   * Convert all relative URLs according to a <base> in HTML
   *
   * When no <base href=...> with an absolute URL is found, the body is
   * returned unchanged.
   *
   * @param string $body HTML body
   *
   * @return string HTML body
   */
  public static function resolve_base($body)
  {
      // check for <base href=...>
      $has_base = preg_match('!(<base.*href=["\']?)([hftps]{3,5}://[a-z0-9/.%-]+)!i', $body, $regs);

      if (!$has_base) {
          return $body;
      }

      $replacer = new rcube_base_replacer($regs[2]);

      return $replacer->replace($body);
  }
  /**
   * Fix broken nested lists, they are not handled properly by DOMDocument (#1488768)
   *
   * A nested <ol>/<ul> that follows a closed <li> directly inside a list
   * (e.g. "<ul><li>a</li><ul>...</ul></ul>") is moved in front of the
   * "</li>" of that preceding item. The content is modified in place.
   *
   * Truncated or otherwise invalid markup (input ending right after a tag
   * name, nested list without a closing tag or its ">") is left untouched.
   *
   * @param string &$html HTML content
   */
  public static function fix_broken_lists(&$html)
  {
      // characters accepted directly after a tag name
      $tag_end = [' ', '>'];

      // do two rounds, one for <ol>, one for <ul>
      foreach (['ol', 'ul'] as $tag) {
          $pos = 0;

          while (($pos = stripos($html, '<' . $tag, $pos)) !== false) {
              $pos++;

              // make sure this is an ol/ul tag. substr() is used instead of
              // a string offset, because the input may end after the tag name
              if (!in_array(substr($html, $pos + 2, 1), $tag_end, true)) {
                  continue;
              }

              $p = $pos;
              $in_li = false;
              $li_pos = 0;

              while (($p = strpos($html, '<', $p)) !== false) {
                  $tt = strtolower(substr($html, $p, 4));
                  // character following the 4-byte tag prefix (empty at the end of input)
                  $next = substr($html, $p + 4, 1);

                  // li open tag
                  if ($tt === '<li>' || $tt === '<li ') {
                      $in_li = true;
                      $p += 4;
                  }
                  // li close tag
                  elseif ($tt === '</li' && in_array($next, $tag_end, true)) {
                      $li_pos = $p;
                      $in_li = false;
                      $p += 4;
                  }
                  // ul/ol closing tag
                  elseif ($tt === '</' . $tag && in_array($next, $tag_end, true)) {
                      break;
                  }
                  // nested ol/ul element out of li
                  elseif (!$in_li && $li_pos && ($tt === '<ol>' || $tt === '<ol ' || $tt === '<ul>' || $tt === '<ul ')) {
                      // find closing tag of this ul/ol element
                      $element = substr($tt, 1, 2);
                      $cpos = $p;

                      do {
                          $tpos = stripos($html, '<' . $element, $cpos + 1);
                          $cpos = stripos($html, '</' . $element, $cpos + 1);
                      } while ($tpos !== false && $cpos !== false && $cpos > $tpos);

                      // not found, this is invalid HTML, skip it
                      if ($cpos === false) {
                          break;
                      }

                      // the closing tag is not terminated, this is invalid HTML too.
                      // Using the false result below would corrupt the content
                      $end = strpos($html, '>', $cpos);
                      if ($end === false) {
                          break;
                      }

                      // get element content
                      $len = $end - $p + 1;
                      $element = substr($html, $p, $len);

                      // move element to the end of the last li
                      $html = substr_replace($html, '', $p, $len);
                      $html = substr_replace($html, $element, $li_pos, 0);

                      // the content length did not change, continue after the moved part
                      $p = $end;
                  } else {
                      $p++;
                  }
              }
          }
      }
  }
  /**
   * Cleanup and workarounds on input to Masterminds/HTML5
   *
   * - Content placed before the first <!DOCTYPE>/<html>/<body> tag is wrapped
   *   in a <div> and re-inserted right after the opening <body> (or <html>) tag.
   * - When the input starts with such leading content, or has no <head>/<body>
   *   tag, an opening <body> (and <html>, if there is none) tag is added.
   * - Directly repeated <tr> / </tr> tags are collapsed into one.
   *
   * @param string $html HTML content
   *
   * @return string HTML content
   */
  protected function fix_html5($html)
  {
      // There might be content before html/body tag, we'll move it to the body
      // We'll wrap it by a div container, it's an invalid HTML anyway
      $prefix = '';
      $first_tag = strpos($html, '<');
      $has_leading_content = $first_tag !== false && $first_tag > 0;

      if ($has_leading_content) {
          $pos = stripos($html, '<!DOCTYPE') ?: stripos($html, '<html') ?: stripos($html, '<body');

          // Without any of these tags there is nothing to split off. Do not
          // build an empty <div></div> from a false position in this case
          if ($pos !== false) {
              $prefix = '<div>' . substr($html, 0, $pos) . '</div>';
              $html = substr($html, $pos);
          }
      }

      // HTML5 requires <head> or <body> (#6713)
      // https://github.com/Masterminds/html5-php/issues/166
      if ($has_leading_content || !preg_match('/<(head|body)/i', $html)) {
          $body_pos = stripos($html, '<body');
          $pos = $body_pos !== false ? $body_pos : stripos($html, '<html');

          // No HTML and no BODY tag
          if ($pos === false) {
              $html = '<html><body>' . $prefix . $html;
          }
          // Either HTML or BODY tag found
          else {
              // insert right after the end of the found opening tag
              $pos = strpos($html, '>', $pos);
              $html = substr_replace($html, ($body_pos === false ? '<body>' : '') . $prefix, $pos + 1, 0);
          }
      }

      // Workaround for HTML5 issue with "invalid" table structure (#7356)
      $html = preg_replace('|<tr>\s*<tr>|', '<tr>', $html);
      $html = preg_replace('|</tr>\s*</tr>|', '</tr>', $html);

      return $html;
  }
  845. }