aeb70391ef2574c01f4015939c88c8c0b42968e3
[moodle.git] / lib / simplepie / library / SimplePie / Sanitize.php
1 <?php
2 /**
3  * SimplePie
4  *
5  * A PHP-Based RSS and Atom Feed Framework.
6  * Takes the hard work out of managing a complete RSS/Atom solution.
7  *
8  * Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without modification, are
12  * permitted provided that the following conditions are met:
13  *
14  *      * Redistributions of source code must retain the above copyright notice, this list of
15  *        conditions and the following disclaimer.
16  *
17  *      * Redistributions in binary form must reproduce the above copyright notice, this list
18  *        of conditions and the following disclaimer in the documentation and/or other materials
19  *        provided with the distribution.
20  *
21  *      * Neither the name of the SimplePie Team nor the names of its contributors may be used
22  *        to endorse or promote products derived from this software without specific prior
23  *        written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
26  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
28  * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33  * POSSIBILITY OF SUCH DAMAGE.
34  *
35  * @package SimplePie
36  * @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
37  * @author Ryan Parman
38  * @author Geoffrey Sneddon
39  * @author Ryan McCue
40  * @link http://simplepie.org/ SimplePie
41  * @license http://www.opensource.org/licenses/bsd-license.php BSD License
42  */
44 /**
45  * Used for data cleanup and post-processing
46  *
47  *
48  * This class can be overloaded with {@see SimplePie::set_sanitize_class()}
49  *
50  * @package SimplePie
51  * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
52  */
53 class SimplePie_Sanitize
54 {
55         // Private vars
56         var $base;
58         // Options
59         var $remove_div = true;
60         var $image_handler = '';
61         var $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
62         var $encode_instead_of_strip = false;
63         var $strip_attributes = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
64         var $add_attributes = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none'));
65         var $strip_comments = false;
66         var $output_encoding = 'UTF-8';
67         var $enable_cache = true;
68         var $cache_location = './cache';
69         var $cache_name_function = 'md5';
70         var $timeout = 10;
71         var $useragent = '';
72         var $force_fsockopen = false;
73         var $replace_url_attributes = null;
75         public function __construct()
76         {
77                 // Set defaults
78                 $this->set_url_replacements(null);
79         }
81         public function remove_div($enable = true)
82         {
83                 $this->remove_div = (bool) $enable;
84         }
86         public function set_image_handler($page = false)
87         {
88                 if ($page)
89                 {
90                         $this->image_handler = (string) $page;
91                 }
92                 else
93                 {
94                         $this->image_handler = false;
95                 }
96         }
98         public function set_registry(SimplePie_Registry $registry)
99         {
100                 $this->registry = $registry;
101         }
103         public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
104         {
105                 if (isset($enable_cache))
106                 {
107                         $this->enable_cache = (bool) $enable_cache;
108                 }
110                 if ($cache_location)
111                 {
112                         $this->cache_location = (string) $cache_location;
113                 }
115                 if ($cache_name_function)
116                 {
117                         $this->cache_name_function = (string) $cache_name_function;
118                 }
119         }
121         public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
122         {
123                 if ($timeout)
124                 {
125                         $this->timeout = (string) $timeout;
126                 }
128                 if ($useragent)
129                 {
130                         $this->useragent = (string) $useragent;
131                 }
133                 if ($force_fsockopen)
134                 {
135                         $this->force_fsockopen = (string) $force_fsockopen;
136                 }
137         }
139         public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
140         {
141                 if ($tags)
142                 {
143                         if (is_array($tags))
144                         {
145                                 $this->strip_htmltags = $tags;
146                         }
147                         else
148                         {
149                                 $this->strip_htmltags = explode(',', $tags);
150                         }
151                 }
152                 else
153                 {
154                         $this->strip_htmltags = false;
155                 }
156         }
158         public function encode_instead_of_strip($encode = false)
159         {
160                 $this->encode_instead_of_strip = (bool) $encode;
161         }
163         public function strip_attributes($attribs = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
164         {
165                 if ($attribs)
166                 {
167                         if (is_array($attribs))
168                         {
169                                 $this->strip_attributes = $attribs;
170                         }
171                         else
172                         {
173                                 $this->strip_attributes = explode(',', $attribs);
174                         }
175                 }
176                 else
177                 {
178                         $this->strip_attributes = false;
179                 }
180         }
182         public function add_attributes($attribs = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none')))
183         {
184                 if ($attribs)
185                 {
186                         if (is_array($attribs))
187                         {
188                                 $this->add_attributes = $attribs;
189                         }
190                         else
191                         {
192                                 $this->add_attributes = explode(',', $attribs);
193                         }
194                 }
195                 else
196                 {
197                         $this->add_attributes = false;
198                 }
199         }
201         public function strip_comments($strip = false)
202         {
203                 $this->strip_comments = (bool) $strip;
204         }
206         public function set_output_encoding($encoding = 'UTF-8')
207         {
208                 $this->output_encoding = (string) $encoding;
209         }
211         /**
212          * Set element/attribute key/value pairs of HTML attributes
213          * containing URLs that need to be resolved relative to the feed
214          *
215          * Defaults to |a|@href, |area|@href, |blockquote|@cite, |del|@cite,
216          * |form|@action, |img|@longdesc, |img|@src, |input|@src, |ins|@cite,
217          * |q|@cite
218          *
219          * @since 1.0
220          * @param array|null $element_attribute Element/attribute key/value pairs, null for default
221          */
222         public function set_url_replacements($element_attribute = null)
223         {
224                 if ($element_attribute === null)
225                 {
226                         $element_attribute = array(
227                                 'a' => 'href',
228                                 'area' => 'href',
229                                 'blockquote' => 'cite',
230                                 'del' => 'cite',
231                                 'form' => 'action',
232                                 'img' => array(
233                                         'longdesc',
234                                         'src'
235                                 ),
236                                 'input' => 'src',
237                                 'ins' => 'cite',
238                                 'q' => 'cite'
239                         );
240                 }
241                 $this->replace_url_attributes = (array) $element_attribute;
242         }
244         public function sanitize($data, $type, $base = '')
245         {
246                 $data = trim($data);
247                 if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI)
248                 {
249                         if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML)
250                         {
251                                 if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data))
252                                 {
253                                         $type |= SIMPLEPIE_CONSTRUCT_HTML;
254                                 }
255                                 else
256                                 {
257                                         $type |= SIMPLEPIE_CONSTRUCT_TEXT;
258                                 }
259                         }
261                         if ($type & SIMPLEPIE_CONSTRUCT_BASE64)
262                         {
263                                 $data = base64_decode($data);
264                         }
266                         if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
267                         {
269                                 if (!class_exists('DOMDocument'))
270                                 {
271                                         throw new SimplePie_Exception('DOMDocument not found, unable to use sanitizer');
272                                 }
273                                 $document = new DOMDocument();
274                                 $document->encoding = 'UTF-8';
276                                 // See https://github.com/simplepie/simplepie/issues/334
277                                 $unique_tag = '#'.uniqid().'#';
278                                 $data = trim($unique_tag . $data . $unique_tag);
280                                 $data = $this->preprocess($data, $type);
282                                 set_error_handler(array('SimplePie_Misc', 'silence_errors'));
283                                 $document->loadHTML($data);
284                                 restore_error_handler();
286                                 $xpath = new DOMXPath($document);
288                                 // Strip comments
289                                 if ($this->strip_comments)
290                                 {
291                                         $comments = $xpath->query('//comment()');
293                                         foreach ($comments as $comment)
294                                         {
295                                                 $comment->parentNode->removeChild($comment);
296                                         }
297                                 }
299                                 // Strip out HTML tags and attributes that might cause various security problems.
300                                 // Based on recommendations by Mark Pilgrim at:
301                                 // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
302                                 if ($this->strip_htmltags)
303                                 {
304                                         foreach ($this->strip_htmltags as $tag)
305                                         {
306                                                 $this->strip_tag($tag, $document, $xpath, $type);
307                                         }
308                                 }
310                                 if ($this->strip_attributes)
311                                 {
312                                         foreach ($this->strip_attributes as $attrib)
313                                         {
314                                                 $this->strip_attr($attrib, $xpath);
315                                         }
316                                 }
318                                 if ($this->add_attributes)
319                                 {
320                                         foreach ($this->add_attributes as $tag => $valuePairs)
321                                         {
322                                                 $this->add_attr($tag, $valuePairs, $document);
323                                         }
324                                 }
326                                 // Replace relative URLs
327                                 $this->base = $base;
328                                 foreach ($this->replace_url_attributes as $element => $attributes)
329                                 {
330                                         $this->replace_urls($document, $element, $attributes);
331                                 }
333                                 // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
334                                 if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
335                                 {
336                                         $images = $document->getElementsByTagName('img');
337                                         foreach ($images as $img)
338                                         {
339                                                 if ($img->hasAttribute('src'))
340                                                 {
341                                                         $image_url = call_user_func($this->cache_name_function, $img->getAttribute('src'));
342                                                         $cache = $this->registry->call('Cache', 'get_handler', array($this->cache_location, $image_url, 'spi'));
344                                                         if ($cache->load())
345                                                         {
346                                                                 $img->setAttribute('src', $this->image_handler . $image_url);
347                                                         }
348                                                         else
349                                                         {
350                                                                 $file = $this->registry->create('File', array($img->getAttribute('src'), $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen));
351                                                                 $headers = $file->headers;
353                                                                 if ($file->success && ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300)))
354                                                                 {
355                                                                         if ($cache->save(array('headers' => $file->headers, 'body' => $file->body)))
356                                                                         {
357                                                                                 $img->setAttribute('src', $this->image_handler . $image_url);
358                                                                         }
359                                                                         else
360                                                                         {
361                                                                                 trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
362                                                                         }
363                                                                 }
364                                                         }
365                                                 }
366                                         }
367                                 }
369                                 // Finally, convert to a HTML string
370                                 $data = trim($document->saveHTML());
371                                 $result = explode($unique_tag, $data);
372                                 // The tags may not be found again if there was invalid markup.
373                                 $data = count($result) === 3 ? $result[1] : '';
375                                 if ($this->remove_div)
376                                 {
377                                         $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
378                                         $data = preg_replace('/<\/div>$/', '', $data);
379                                 }
380                                 else
381                                 {
382                                         $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
383                                 }
384                         }
386                         if ($type & SIMPLEPIE_CONSTRUCT_IRI)
387                         {
388                                 $absolute = $this->registry->call('Misc', 'absolutize_url', array($data, $base));
389                                 if ($absolute !== false)
390                                 {
391                                         $data = $absolute;
392                                 }
393                         }
395                         if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
396                         {
397                                 $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
398                         }
400                         if ($this->output_encoding !== 'UTF-8')
401                         {
402                                 $data = $this->registry->call('Misc', 'change_encoding', array($data, 'UTF-8', $this->output_encoding));
403                         }
404                 }
405                 return $data;
406         }
408         protected function preprocess($html, $type)
409         {
410                 $ret = '';
411                 $html = preg_replace('%</?(?:html|body)[^>]*?'.'>%is', '', $html);
412                 if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML)
413                 {
414                         // Atom XHTML constructs are wrapped with a div by default
415                         // Note: No protection if $html contains a stray </div>!
416                         $html = '<div>' . $html . '</div>';
417                         $ret .= '<!DOCTYPE html>';
418                         $content_type = 'text/html';
419                 }
420                 else
421                 {
422                         $ret .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
423                         $content_type = 'application/xhtml+xml';
424                 }
426                 $ret .= '<html><head>';
427                 $ret .= '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />';
428                 $ret .= '</head><body>' . $html . '</body></html>';
429                 return $ret;
430         }
432         public function replace_urls($document, $tag, $attributes)
433         {
434                 if (!is_array($attributes))
435                 {
436                         $attributes = array($attributes);
437                 }
439                 if (!is_array($this->strip_htmltags) || !in_array($tag, $this->strip_htmltags))
440                 {
441                         $elements = $document->getElementsByTagName($tag);
442                         foreach ($elements as $element)
443                         {
444                                 foreach ($attributes as $attribute)
445                                 {
446                                         if ($element->hasAttribute($attribute))
447                                         {
448                                                 $value = $this->registry->call('Misc', 'absolutize_url', array($element->getAttribute($attribute), $this->base));
449                                                 if ($value !== false)
450                                                 {
451                                                         $element->setAttribute($attribute, $value);
452                                                 }
453                                         }
454                                 }
455                         }
456                 }
457         }
459         public function do_strip_htmltags($match)
460         {
461                 if ($this->encode_instead_of_strip)
462                 {
463                         if (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
464                         {
465                                 $match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
466                                 $match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
467                                 return "&lt;$match[1]$match[2]&gt;$match[3]&lt;/$match[1]&gt;";
468                         }
469                         else
470                         {
471                                 return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
472                         }
473                 }
474                 elseif (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
475                 {
476                         return $match[4];
477                 }
478                 else
479                 {
480                         return '';
481                 }
482         }
484         protected function strip_tag($tag, $document, $xpath, $type)
485         {
486                 $elements = $xpath->query('body//' . $tag);
487                 if ($this->encode_instead_of_strip)
488                 {
489                         foreach ($elements as $element)
490                         {
491                                 $fragment = $document->createDocumentFragment();
493                                 // For elements which aren't script or style, include the tag itself
494                                 if (!in_array($tag, array('script', 'style')))
495                                 {
496                                         $text = '<' . $tag;
497                                         if ($element->hasAttributes())
498                                         {
499                                                 $attrs = array();
500                                                 foreach ($element->attributes as $name => $attr)
501                                                 {
502                                                         $value = $attr->value;
504                                                         // In XHTML, empty values should never exist, so we repeat the value
505                                                         if (empty($value) && ($type & SIMPLEPIE_CONSTRUCT_XHTML))
506                                                         {
507                                                                 $value = $name;
508                                                         }
509                                                         // For HTML, empty is fine
510                                                         elseif (empty($value) && ($type & SIMPLEPIE_CONSTRUCT_HTML))
511                                                         {
512                                                                 $attrs[] = $name;
513                                                                 continue;
514                                                         }
516                                                         // Standard attribute text
517                                                         $attrs[] = $name . '="' . $attr->value . '"';
518                                                 }
519                                                 $text .= ' ' . implode(' ', $attrs);
520                                         }
521                                         $text .= '>';
522                                         $fragment->appendChild(new DOMText($text));
523                                 }
525                                 $number = $element->childNodes->length;
526                                 for ($i = $number; $i > 0; $i--)
527                                 {
528                                         $child = $element->childNodes->item(0);
529                                         $fragment->appendChild($child);
530                                 }
532                                 if (!in_array($tag, array('script', 'style')))
533                                 {
534                                         $fragment->appendChild(new DOMText('</' . $tag . '>'));
535                                 }
537                                 $element->parentNode->replaceChild($fragment, $element);
538                         }
540                         return;
541                 }
542                 elseif (in_array($tag, array('script', 'style')))
543                 {
544                         foreach ($elements as $element)
545                         {
546                                 $element->parentNode->removeChild($element);
547                         }
549                         return;
550                 }
551                 else
552                 {
553                         foreach ($elements as $element)
554                         {
555                                 $fragment = $document->createDocumentFragment();
556                                 $number = $element->childNodes->length;
557                                 for ($i = $number; $i > 0; $i--)
558                                 {
559                                         $child = $element->childNodes->item(0);
560                                         $fragment->appendChild($child);
561                                 }
563                                 $element->parentNode->replaceChild($fragment, $element);
564                         }
565                 }
566         }
568         protected function strip_attr($attrib, $xpath)
569         {
570                 $elements = $xpath->query('//*[@' . $attrib . ']');
572                 foreach ($elements as $element)
573                 {
574                         $element->removeAttribute($attrib);
575                 }
576         }
578         protected function add_attr($tag, $valuePairs, $document)
579         {
580                 $elements = $document->getElementsByTagName($tag);
581                 foreach ($elements as $element)
582                 {
583                         foreach ($valuePairs as $attrib => $value)
584                         {
585                                 $element->setAttribute($attrib, $value);
586                         }
587                 }
588         }