MDL-60236 libraries: Upgrade simplepie to 1.5.0
[moodle.git] / lib / simplepie / library / SimplePie / Sanitize.php
1 <?php
2 /**
3  * SimplePie
4  *
5  * A PHP-Based RSS and Atom Feed Framework.
6  * Takes the hard work out of managing a complete RSS/Atom solution.
7  *
8  * Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without modification, are
12  * permitted provided that the following conditions are met:
13  *
14  *      * Redistributions of source code must retain the above copyright notice, this list of
15  *        conditions and the following disclaimer.
16  *
17  *      * Redistributions in binary form must reproduce the above copyright notice, this list
18  *        of conditions and the following disclaimer in the documentation and/or other materials
19  *        provided with the distribution.
20  *
21  *      * Neither the name of the SimplePie Team nor the names of its contributors may be used
22  *        to endorse or promote products derived from this software without specific prior
23  *        written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
26  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
28  * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33  * POSSIBILITY OF SUCH DAMAGE.
34  *
35  * @package SimplePie
36  * @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
37  * @author Ryan Parman
38  * @author Geoffrey Sneddon
39  * @author Ryan McCue
40  * @link http://simplepie.org/ SimplePie
41  * @license http://www.opensource.org/licenses/bsd-license.php BSD License
42  */
44 /**
45  * Used for data cleanup and post-processing
46  *
47  *
48  * This class can be overloaded with {@see SimplePie::set_sanitize_class()}
49  *
50  * @package SimplePie
51  * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
52  */
53 class SimplePie_Sanitize
54 {
// Private vars
// Base URL of the data currently being sanitized; set by sanitize() and read by replace_urls().
var $base;
// Registry used to reach the Cache, File and Misc classes; injected through set_registry().
// Declared explicitly so that assigning it does not create a dynamic property.
var $registry;

// Options
var $remove_div = true;
var $image_handler = '';
var $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
var $encode_instead_of_strip = false;
var $strip_attributes = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
var $add_attributes = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none'));
var $strip_comments = false;
var $output_encoding = 'UTF-8';
var $enable_cache = true;
var $cache_location = './cache';
var $cache_name_function = 'md5';
var $timeout = 10;
var $useragent = '';
var $force_fsockopen = false;
// Map of element name => attribute name(s) holding URLs; filled in by set_url_replacements().
var $replace_url_attributes = null;
/**
 * Create a sanitizer whose URL-bearing attribute map starts at its defaults.
 */
public function __construct()
{
    // null asks set_url_replacements() to install its built-in element/attribute map
    $use_defaults = null;
    $this->set_url_replacements($use_defaults);
}
/**
 * Choose whether sanitize() drops the wrapping <div> from (X)HTML output.
 *
 * @param mixed $enable Cast to boolean before being stored
 */
public function remove_div($enable = true)
{
    $flag = (bool) $enable;
    $this->remove_div = $flag;
}
/**
 * Set the URL prefix that sanitize() puts in front of cached image names.
 *
 * @param string|false $page Handler prefix; any falsy value stores false,
 *                           which turns the image rewriting in sanitize() off
 */
public function set_image_handler($page = false)
{
    $this->image_handler = $page ? (string) $page : false;
}
/**
 * Inject the registry through which sanitize() and replace_urls() reach
 * the Cache, File and Misc classes.
 *
 * @param SimplePie_Registry $registry Registry instance to keep
 */
public function set_registry(SimplePie_Registry $registry)
{
    $this->registry = $registry;
}
/**
 * Copy the cache configuration into the sanitizer.
 *
 * A null $enable_cache, or a falsy location / name function, leaves the
 * corresponding current setting untouched.
 *
 * @param mixed $enable_cache Stored as boolean unless null
 * @param string $cache_location Cache location, stored as string when truthy
 * @param string $cache_name_function Callable name used to build cache keys, stored as string when truthy
 * @param string $cache_class Accepted for signature compatibility; not used by this method
 */
public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
{
    if ($enable_cache !== null)
    {
        $this->enable_cache = (bool) $enable_cache;
    }

    $string_settings = array(
        'cache_location' => $cache_location,
        'cache_name_function' => $cache_name_function
    );
    foreach ($string_settings as $property => $candidate)
    {
        if ($candidate)
        {
            $this->$property = (string) $candidate;
        }
    }
}
/**
 * Copy the remote-file settings used when sanitize() downloads images.
 *
 * Each truthy argument is stored as a string; falsy arguments leave the
 * current value unchanged.
 *
 * @param string $file_class Accepted for signature compatibility; not used by this method
 * @param int $timeout Timeout handed to the File class
 * @param string $useragent User agent handed to the File class
 * @param bool $force_fsockopen Flag handed to the File class
 */
public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
{
    $candidates = array(
        'timeout' => $timeout,
        'useragent' => $useragent,
        'force_fsockopen' => $force_fsockopen
    );

    foreach ($candidates as $property => $candidate)
    {
        if ($candidate)
        {
            $this->$property = (string) $candidate;
        }
    }
}
/**
 * Set the list of element names that sanitize() strips from (X)HTML.
 *
 * @param array|string|false $tags Array of tag names, or a comma-separated
 *                                 string (split on ','); a falsy value stores
 *                                 false, which disables tag stripping
 */
public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
{
    if (!$tags)
    {
        $this->strip_htmltags = false;
        return;
    }

    $this->strip_htmltags = is_array($tags) ? $tags : explode(',', $tags);
}
/**
 * Choose whether stripped tags are kept in the output as escaped text
 * instead of being removed.
 *
 * @param mixed $encode Cast to boolean before being stored
 */
public function encode_instead_of_strip($encode = false)
{
    $flag = (bool) $encode;
    $this->encode_instead_of_strip = $flag;
}
/**
 * Set the list of attribute names that sanitize() removes from every element.
 *
 * @param array|string|false $attribs Array of attribute names, or a
 *                                    comma-separated string (split on ',');
 *                                    a falsy value stores false, which
 *                                    disables attribute stripping
 */
public function strip_attributes($attribs = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
{
    if (!$attribs)
    {
        $this->strip_attributes = false;
        return;
    }

    if (!is_array($attribs))
    {
        $attribs = explode(',', $attribs);
    }
    $this->strip_attributes = $attribs;
}
/**
 * Set the attributes that sanitize() forces onto specific elements.
 *
 * @param array|string|false $attribs Map of tag name => (attribute => value)
 *                                    pairs; a falsy value stores false, which
 *                                    disables attribute injection. A
 *                                    non-array value is split on ','.
 */
public function add_attributes($attribs = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none')))
{
    if (!$attribs)
    {
        $this->add_attributes = false;
        return;
    }

    // NOTE(review): the comma-separated form yields a plain list of strings, not the
    // tag => pairs map that sanitize() iterates over - confirm whether it is still needed.
    $this->add_attributes = is_array($attribs) ? $attribs : explode(',', $attribs);
}
/**
 * Choose whether sanitize() removes comment nodes from (X)HTML content.
 *
 * @param mixed $strip Cast to boolean before being stored
 */
public function strip_comments($strip = false)
{
    $flag = (bool) $strip;
    $this->strip_comments = $flag;
}
/**
 * Set the character encoding sanitize() converts its result to.
 *
 * sanitize() only performs a conversion when this differs from 'UTF-8'.
 *
 * @param string $encoding Encoding name, stored as string
 */
public function set_output_encoding($encoding = 'UTF-8')
{
    $name = (string) $encoding;
    $this->output_encoding = $name;
}
/**
 * Define which HTML attributes hold URLs that must be resolved relative
 * to the feed.
 *
 * Keys are element names; each value is either one attribute name or an
 * array of attribute names.
 *
 * The default map is |a|@href, |area|@href, |blockquote|@cite, |del|@cite,
 * |form|@action, |img|@longdesc, |img|@src, |input|@src, |ins|@cite,
 * |q|@cite
 *
 * @since 1.0
 * @param array|null $element_attribute Element/attribute key/value pairs, null for default
 */
public function set_url_replacements($element_attribute = null)
{
    $defaults = array(
        'a' => 'href',
        'area' => 'href',
        'blockquote' => 'cite',
        'del' => 'cite',
        'form' => 'action',
        'img' => array(
            'longdesc',
            'src'
        ),
        'input' => 'src',
        'ins' => 'cite',
        'q' => 'cite'
    );

    $chosen = ($element_attribute === null) ? $defaults : $element_attribute;
    $this->replace_url_attributes = (array) $chosen;
}
/**
 * Clean up a piece of feed data according to its construct type.
 *
 * Depending on the bits set in $type the data is base64-decoded, run through
 * the DOM-based (X)HTML clean-up (comment, tag and attribute stripping,
 * attribute injection, relative URL resolution, optional image caching),
 * resolved as an IRI against $base, HTML-escaped, and finally converted from
 * UTF-8 to the configured output encoding.
 *
 * @param string $data Raw data; surrounding whitespace is trimmed first
 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
 * @param string $base Base URL used to resolve relative URLs
 * @return string The processed data (only trimmed when it is empty and $type has no IRI bit)
 * @throws SimplePie_Exception When (X)HTML must be processed but DOMDocument is unavailable
 */
public function sanitize($data, $type, $base = '')
{
    $data = trim($data);
    if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI)
    {
        // "Maybe HTML" is settled by looking for an entity or a closing tag.
        if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML)
        {
            if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data))
            {
                $type |= SIMPLEPIE_CONSTRUCT_HTML;
            }
            else
            {
                $type |= SIMPLEPIE_CONSTRUCT_TEXT;
            }
        }

        if ($type & SIMPLEPIE_CONSTRUCT_BASE64)
        {
            $data = base64_decode($data);
        }

        if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
        {
            if (!class_exists('DOMDocument'))
            {
                throw new SimplePie_Exception('DOMDocument not found, unable to use sanitizer');
            }
            $document = new DOMDocument();
            $document->encoding = 'UTF-8';

            // Wrap the fragment in a full document so that loadHTML() can parse it.
            $data = $this->preprocess($data, $type);

            // Parser warnings about sloppy markup are routed to a silencing handler.
            set_error_handler(array('SimplePie_Misc', 'silence_errors'));
            $document->loadHTML($data);
            restore_error_handler();

            $xpath = new DOMXPath($document);

            // Strip comments
            if ($this->strip_comments)
            {
                $comments = $xpath->query('//comment()');

                foreach ($comments as $comment)
                {
                    $comment->parentNode->removeChild($comment);
                }
            }

            // Strip out HTML tags and attributes that might cause various security problems.
            // Based on recommendations by Mark Pilgrim at:
            // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
            if ($this->strip_htmltags)
            {
                foreach ($this->strip_htmltags as $tag)
                {
                    $this->strip_tag($tag, $document, $xpath, $type);
                }
            }

            if ($this->strip_attributes)
            {
                foreach ($this->strip_attributes as $attrib)
                {
                    $this->strip_attr($attrib, $xpath);
                }
            }

            if ($this->add_attributes)
            {
                foreach ($this->add_attributes as $tag => $valuePairs)
                {
                    $this->add_attr($tag, $valuePairs, $document);
                }
            }

            // Replace relative URLs
            $this->base = $base;
            foreach ($this->replace_url_attributes as $element => $attributes)
            {
                $this->replace_urls($document, $element, $attributes);
            }

            // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
            if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
            {
                $images = $document->getElementsByTagName('img');
                foreach ($images as $img)
                {
                    if ($img->hasAttribute('src'))
                    {
                        $image_url = call_user_func($this->cache_name_function, $img->getAttribute('src'));
                        $cache = $this->registry->call('Cache', 'get_handler', array($this->cache_location, $image_url, 'spi'));

                        if ($cache->load())
                        {
                            $img->setAttribute('src', $this->image_handler . $image_url);
                        }
                        else
                        {
                            // Only forward the client address when there is one
                            // (for example, it is not set when running from the CLI).
                            $request_headers = array();
                            if (isset($_SERVER['REMOTE_ADDR']))
                            {
                                $request_headers['X-FORWARDED-FOR'] = $_SERVER['REMOTE_ADDR'];
                            }
                            $file = $this->registry->create('File', array($img->getAttribute('src'), $this->timeout, 5, $request_headers, $this->useragent, $this->force_fsockopen));

                            // Accept the download when it did not come from a remote source, or when
                            // the status code is 200 or lies between 207 and 299.
                            // The bitwise test needs its own parentheses: === binds tighter than &.
                            if ($file->success && (($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE) === 0 || ($file->status_code === 200 || ($file->status_code > 206 && $file->status_code < 300))))
                            {
                                if ($cache->save(array('headers' => $file->headers, 'body' => $file->body)))
                                {
                                    $img->setAttribute('src', $this->image_handler . $image_url);
                                }
                                else
                                {
                                    trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
                                }
                            }
                        }
                    }
                }
            }

            // Get content node
            $div = $document->getElementsByTagName('body')->item(0)->firstChild;
            // Finally, convert to a HTML string
            if (version_compare(PHP_VERSION, '5.3.6', '>='))
            {
                $data = trim($document->saveHTML($div));
            }
            else
            {
                $data = trim($document->saveXML($div));
            }

            if ($this->remove_div)
            {
                $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
                $data = preg_replace('/<\/div>$/', '', $data);
            }
            else
            {
                // Keep the wrapper but drop whatever attributes it carried.
                $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
            }
        }

        if ($type & SIMPLEPIE_CONSTRUCT_IRI)
        {
            $absolute = $this->registry->call('Misc', 'absolutize_url', array($data, $base));
            if ($absolute !== false)
            {
                $data = $absolute;
            }
        }

        if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
        {
            $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
        }

        if ($this->output_encoding !== 'UTF-8')
        {
            $data = $this->registry->call('Misc', 'change_encoding', array($data, 'UTF-8', $this->output_encoding));
        }
    }
    return $data;
}
/**
 * Wrap a markup fragment in a complete document that DOMDocument can load.
 *
 * Any <html>/<body> open or close tags inside the fragment are removed first.
 * When $type has any bit other than SIMPLEPIE_CONSTRUCT_XHTML set, the
 * fragment is additionally wrapped in a <div> and given an HTML5 doctype;
 * otherwise an XHTML 1.0 Strict doctype is used and no <div> is added.
 *
 * @param string $html Fragment to wrap
 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
 * @return string Full document with a UTF-8 Content-Type <meta> in its head
 */
protected function preprocess($html, $type)
{
    $html = preg_replace('%</?(?:html|body)[^>]*?'.'>%is', '', $html);

    if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML)
    {
        // Atom XHTML constructs already come wrapped in a div, so give the
        // other constructs one as well.
        // Note: a stray </div> inside $html is not guarded against.
        $html = '<div>' . $html . '</div>';
        $doctype = '<!DOCTYPE html>';
        $content_type = 'text/html';
    }
    else
    {
        $doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
        $content_type = 'application/xhtml+xml';
    }

    $head = '<html><head>' . '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />' . '</head>';

    return $doctype . $head . '<body>' . $html . '</body></html>';
}
/**
 * Resolve the URL-bearing attributes of every $tag element against the
 * current base URL.
 *
 * Nothing is done for a tag that is listed in the strip_htmltags option.
 * An attribute is left untouched when the URL resolver returns false.
 *
 * @param DOMDocument $document Document to update in place
 * @param string $tag Element name to look for
 * @param string|array $attributes One attribute name, or an array of names
 */
public function replace_urls($document, $tag, $attributes)
{
    $attribute_names = is_array($attributes) ? $attributes : array($attributes);

    // Elements that are going to be stripped anyway are skipped.
    if (is_array($this->strip_htmltags) && in_array($tag, $this->strip_htmltags))
    {
        return;
    }

    foreach ($document->getElementsByTagName($tag) as $node)
    {
        foreach ($attribute_names as $attribute_name)
        {
            if (!$node->hasAttribute($attribute_name))
            {
                continue;
            }

            $resolved = $this->registry->call('Misc', 'absolutize_url', array($node->getAttribute($attribute_name), $this->base));
            if ($resolved !== false)
            {
                $node->setAttribute($attribute_name, $resolved);
            }
        }
    }
}
/**
 * Turn one regex match of an unwanted tag into its replacement text.
 *
 * NOTE(review): presumably a preg_replace_callback() handler kept from the
 * regex-based stripper - no caller is visible in this part of the file.
 *
 * As used below, $match[1] is the tag name, $match[2] its attribute text,
 * $match[3] the inner content emitted when encoding, and $match[4] the
 * content returned when stripping; $match[4] is absent for matches without
 * content.
 *
 * @param array $match Regex match groups
 * @return string Replacement for the whole match
 */
public function do_strip_htmltags($match)
{
    // Content survives only for matched pairs that are not script/style.
    $keeps_content = isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style'));

    if (!$this->encode_instead_of_strip)
    {
        return $keeps_content ? $match[4] : '';
    }

    if (!$keeps_content)
    {
        return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
    }

    $match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
    $match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
    return "&lt;$match[1]$match[2]&gt;$match[3]&lt;/$match[1]&gt;";
}
/**
 * Remove every $tag element found below <body>.
 *
 * - With encode_instead_of_strip enabled, the element is replaced by its
 *   child nodes, surrounded (except for script/style) by text nodes that
 *   spell out the opening and closing tag.
 * - Otherwise script and style elements are removed together with their
 *   content.
 * - Any other element is replaced by its child nodes.
 *
 * @param string $tag Element name to strip
 * @param DOMDocument $document Document being sanitized; used to create fragments
 * @param DOMXPath $xpath XPath helper bound to $document
 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags; decides how empty
 *                  attribute values are written when encoding
 */
protected function strip_tag($tag, $document, $xpath, $type)
{
    $elements = $xpath->query('body//' . $tag);
    $is_script_or_style = in_array($tag, array('script', 'style'));

    if ($this->encode_instead_of_strip)
    {
        foreach ($elements as $element)
        {
            $fragment = $document->createDocumentFragment();

            // For elements which aren't script or style, include the tag itself
            if (!$is_script_or_style)
            {
                $text = '<' . $tag;
                if ($element->hasAttributes())
                {
                    $attrs = array();
                    foreach ($element->attributes as $name => $attr)
                    {
                        $value = $attr->value;

                        // Compare against '' explicitly: empty() would also
                        // treat a real value of "0" as missing.
                        if ($value === '')
                        {
                            // In XHTML, empty values should never exist, so we repeat the name
                            if ($type & SIMPLEPIE_CONSTRUCT_XHTML)
                            {
                                $value = $name;
                            }
                            // For HTML, a bare attribute name is fine
                            elseif ($type & SIMPLEPIE_CONSTRUCT_HTML)
                            {
                                $attrs[] = $name;
                                continue;
                            }
                        }

                        // Standard attribute text; uses $value so that the XHTML
                        // fallback above actually reaches the output.
                        $attrs[] = $name . '="' . $value . '"';
                    }
                    $text .= ' ' . implode(' ', $attrs);
                }
                $text .= '>';
                $fragment->appendChild(new DOMText($text));
            }

            // Appending a child to the fragment detaches it from $element,
            // so the first child is taken repeatedly until none remain.
            while ($element->firstChild !== null)
            {
                $fragment->appendChild($element->firstChild);
            }

            if (!$is_script_or_style)
            {
                $fragment->appendChild(new DOMText('</' . $tag . '>'));
            }

            $element->parentNode->replaceChild($fragment, $element);
        }

        return;
    }
    elseif ($is_script_or_style)
    {
        // Script and style content is dropped along with the element.
        foreach ($elements as $element)
        {
            $element->parentNode->removeChild($element);
        }

        return;
    }
    else
    {
        foreach ($elements as $element)
        {
            $fragment = $document->createDocumentFragment();
            while ($element->firstChild !== null)
            {
                $fragment->appendChild($element->firstChild);
            }

            $element->parentNode->replaceChild($fragment, $element);
        }
    }
}
/**
 * Remove one attribute from every element in the document that carries it.
 *
 * @param string $attrib Attribute name to remove
 * @param DOMXPath $xpath XPath helper bound to the document being sanitized
 */
protected function strip_attr($attrib, $xpath)
{
    $selector = '//*[@' . $attrib . ']';

    foreach ($xpath->query($selector) as $carrier)
    {
        $carrier->removeAttribute($attrib);
    }
}
/**
 * Set a group of attributes on every $tag element in the document,
 * overwriting any value an element already has for them.
 *
 * @param string $tag Element name to look for
 * @param array $valuePairs Map of attribute name => value to set
 * @param DOMDocument $document Document to update in place
 */
protected function add_attr($tag, $valuePairs, $document)
{
    foreach ($document->getElementsByTagName($tag) as $node)
    {
        foreach ($valuePairs as $attribute_name => $attribute_value)
        {
            $node->setAttribute($attribute_name, $attribute_value);
        }
    }
}