6810cc49ff119251637b9235aa08dbe96d29a6ed
[moodle.git] / lib / simplepie / library / SimplePie / Sanitize.php
1 <?php
2 /**
3  * SimplePie
4  *
5  * A PHP-Based RSS and Atom Feed Framework.
6  * Takes the hard work out of managing a complete RSS/Atom solution.
7  *
8  * Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without modification, are
12  * permitted provided that the following conditions are met:
13  *
14  *      * Redistributions of source code must retain the above copyright notice, this list of
15  *        conditions and the following disclaimer.
16  *
17  *      * Redistributions in binary form must reproduce the above copyright notice, this list
18  *        of conditions and the following disclaimer in the documentation and/or other materials
19  *        provided with the distribution.
20  *
21  *      * Neither the name of the SimplePie Team nor the names of its contributors may be used
22  *        to endorse or promote products derived from this software without specific prior
23  *        written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
26  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
28  * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33  * POSSIBILITY OF SUCH DAMAGE.
34  *
35  * @package SimplePie
36  * @version 1.3.1
37  * @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
38  * @author Ryan Parman
39  * @author Geoffrey Sneddon
40  * @author Ryan McCue
41  * @link http://simplepie.org/ SimplePie
42  * @license http://www.opensource.org/licenses/bsd-license.php BSD License
43  */
45 /**
46  * Used for data cleanup and post-processing
47  *
48  *
49  * This class can be overloaded with {@see SimplePie::set_sanitize_class()}
50  *
51  * @package SimplePie
52  * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
53  */
54 class SimplePie_Sanitize
55 {
56         // Private vars
57         var $base;
59         // Options
60         var $remove_div = true;
61         var $image_handler = '';
62         var $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
63         var $encode_instead_of_strip = false;
64         var $strip_attributes = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
65         var $strip_comments = false;
66         var $output_encoding = 'UTF-8';
67         var $enable_cache = true;
68         var $cache_location = './cache';
69         var $cache_name_function = 'md5';
70         var $timeout = 10;
71         var $useragent = '';
72         var $force_fsockopen = false;
73         var $replace_url_attributes = null;
75         public function __construct()
76         {
77                 // Set defaults
78                 $this->set_url_replacements(null);
79         }
81         public function remove_div($enable = true)
82         {
83                 $this->remove_div = (bool) $enable;
84         }
86         public function set_image_handler($page = false)
87         {
88                 if ($page)
89                 {
90                         $this->image_handler = (string) $page;
91                 }
92                 else
93                 {
94                         $this->image_handler = false;
95                 }
96         }
98         public function set_registry(SimplePie_Registry $registry)
99         {
100                 $this->registry = $registry;
101         }
103         public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
104         {
105                 if (isset($enable_cache))
106                 {
107                         $this->enable_cache = (bool) $enable_cache;
108                 }
110                 if ($cache_location)
111                 {
112                         $this->cache_location = (string) $cache_location;
113                 }
115                 if ($cache_name_function)
116                 {
117                         $this->cache_name_function = (string) $cache_name_function;
118                 }
119         }
121         public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
122         {
123                 if ($timeout)
124                 {
125                         $this->timeout = (string) $timeout;
126                 }
128                 if ($useragent)
129                 {
130                         $this->useragent = (string) $useragent;
131                 }
133                 if ($force_fsockopen)
134                 {
135                         $this->force_fsockopen = (string) $force_fsockopen;
136                 }
137         }
139         public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
140         {
141                 if ($tags)
142                 {
143                         if (is_array($tags))
144                         {
145                                 $this->strip_htmltags = $tags;
146                         }
147                         else
148                         {
149                                 $this->strip_htmltags = explode(',', $tags);
150                         }
151                 }
152                 else
153                 {
154                         $this->strip_htmltags = false;
155                 }
156         }
158         public function encode_instead_of_strip($encode = false)
159         {
160                 $this->encode_instead_of_strip = (bool) $encode;
161         }
163         public function strip_attributes($attribs = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
164         {
165                 if ($attribs)
166                 {
167                         if (is_array($attribs))
168                         {
169                                 $this->strip_attributes = $attribs;
170                         }
171                         else
172                         {
173                                 $this->strip_attributes = explode(',', $attribs);
174                         }
175                 }
176                 else
177                 {
178                         $this->strip_attributes = false;
179                 }
180         }
182         public function strip_comments($strip = false)
183         {
184                 $this->strip_comments = (bool) $strip;
185         }
187         public function set_output_encoding($encoding = 'UTF-8')
188         {
189                 $this->output_encoding = (string) $encoding;
190         }
192         /**
193          * Set element/attribute key/value pairs of HTML attributes
194          * containing URLs that need to be resolved relative to the feed
195          *
196          * Defaults to |a|@href, |area|@href, |blockquote|@cite, |del|@cite,
197          * |form|@action, |img|@longdesc, |img|@src, |input|@src, |ins|@cite,
198          * |q|@cite
199          *
200          * @since 1.0
201          * @param array|null $element_attribute Element/attribute key/value pairs, null for default
202          */
203         public function set_url_replacements($element_attribute = null)
204         {
205                 if ($element_attribute === null)
206                 {
207                         $element_attribute = array(
208                                 'a' => 'href',
209                                 'area' => 'href',
210                                 'blockquote' => 'cite',
211                                 'del' => 'cite',
212                                 'form' => 'action',
213                                 'img' => array(
214                                         'longdesc',
215                                         'src'
216                                 ),
217                                 'input' => 'src',
218                                 'ins' => 'cite',
219                                 'q' => 'cite'
220                         );
221                 }
222                 $this->replace_url_attributes = (array) $element_attribute;
223         }
225         public function sanitize($data, $type, $base = '')
226         {
227                 $data = trim($data);
228                 if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI)
229                 {
230                         if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML)
231                         {
232                                 if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data))
233                                 {
234                                         $type |= SIMPLEPIE_CONSTRUCT_HTML;
235                                 }
236                                 else
237                                 {
238                                         $type |= SIMPLEPIE_CONSTRUCT_TEXT;
239                                 }
240                         }
242                         if ($type & SIMPLEPIE_CONSTRUCT_BASE64)
243                         {
244                                 $data = base64_decode($data);
245                         }
247                         if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
248                         {
250                                 $document = new DOMDocument();
251                                 $document->encoding = 'UTF-8';
252                                 $data = $this->preprocess($data, $type);
254                                 set_error_handler(array('SimplePie_Misc', 'silence_errors'));
255                                 $document->loadHTML($data);
256                                 restore_error_handler();
258                                 // Strip comments
259                                 if ($this->strip_comments)
260                                 {
261                                         $xpath = new DOMXPath($document);
262                                         $comments = $xpath->query('//comment()');
264                                         foreach ($comments as $comment)
265                                         {
266                                                 $comment->parentNode->removeChild($comment);
267                                         }
268                                 }
270                                 // Strip out HTML tags and attributes that might cause various security problems.
271                                 // Based on recommendations by Mark Pilgrim at:
272                                 // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
273                                 if ($this->strip_htmltags)
274                                 {
275                                         foreach ($this->strip_htmltags as $tag)
276                                         {
277                                                 $this->strip_tag($tag, $document, $type);
278                                         }
279                                 }
281                                 if ($this->strip_attributes)
282                                 {
283                                         foreach ($this->strip_attributes as $attrib)
284                                         {
285                                                 $this->strip_attr($attrib, $document);
286                                         }
287                                 }
289                                 // Replace relative URLs
290                                 $this->base = $base;
291                                 foreach ($this->replace_url_attributes as $element => $attributes)
292                                 {
293                                         $this->replace_urls($document, $element, $attributes);
294                                 }
296                                 // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
297                                 if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
298                                 {
299                                         $images = $document->getElementsByTagName('img');
300                                         foreach ($images as $img)
301                                         {
302                                                 if ($img->hasAttribute('src'))
303                                                 {
304                                                         $image_url = call_user_func($this->cache_name_function, $img->getAttribute('src'));
305                                                         $cache = $this->registry->call('Cache', 'get_handler', array($this->cache_location, $image_url, 'spi'));
307                                                         if ($cache->load())
308                                                         {
309                                                                 $img->setAttribute('src', $this->image_handler . $image_url);
310                                                         }
311                                                         else
312                                                         {
313                                                                 $file = $this->registry->create('File', array($img['attribs']['src']['data'], $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen));
314                                                                 $headers = $file->headers;
316                                                                 if ($file->success && ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300)))
317                                                                 {
318                                                                         if ($cache->save(array('headers' => $file->headers, 'body' => $file->body)))
319                                                                         {
320                                                                                 $img->setAttribute('src', $this->image_handler . $image_url);
321                                                                         }
322                                                                         else
323                                                                         {
324                                                                                 trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
325                                                                         }
326                                                                 }
327                                                         }
328                                                 }
329                                         }
330                                 }
332                                 // Remove the DOCTYPE
333                                 // Seems to cause segfaulting if we don't do this
334                                 if ($document->firstChild instanceof DOMDocumentType)
335                                 {
336                                         $document->removeChild($document->firstChild);
337                                 }
339                                 // Move everything from the body to the root
340                                 $real_body = $document->getElementsByTagName('body')->item(0)->childNodes->item(0);
341                                 $document->replaceChild($real_body, $document->firstChild);
343                                 // Finally, convert to a HTML string
344                                 $data = trim($document->saveHTML());
346                                 if ($this->remove_div)
347                                 {
348                                         $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
349                                         $data = preg_replace('/<\/div>$/', '', $data);
350                                 }
351                                 else
352                                 {
353                                         $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
354                                 }
355                         }
357                         if ($type & SIMPLEPIE_CONSTRUCT_IRI)
358                         {
359                                 $absolute = $this->registry->call('Misc', 'absolutize_url', array($data, $base));
360                                 if ($absolute !== false)
361                                 {
362                                         $data = $absolute;
363                                 }
364                         }
366                         if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
367                         {
368                                 $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
369                         }
371                         if ($this->output_encoding !== 'UTF-8')
372                         {
373                                 $data = $this->registry->call('Misc', 'change_encoding', array($data, 'UTF-8', $this->output_encoding));
374                         }
375                 }
376                 return $data;
377         }
379         protected function preprocess($html, $type)
380         {
381                 $ret = '';
382                 if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML)
383                 {
384                         // Atom XHTML constructs are wrapped with a div by default
385                         // Note: No protection if $html contains a stray </div>!
386                         $html = '<div>' . $html . '</div>';
387                         $ret .= '<!DOCTYPE html>';
388                         $content_type = 'text/html';
389                 }
390                 else
391                 {
392                         $ret .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
393                         $content_type = 'application/xhtml+xml';
394                 }
396                 $ret .= '<html><head>';
397                 $ret .= '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />';
398                 $ret .= '</head><body>' . $html . '</body></html>';
399                 return $ret;
400         }
402         public function replace_urls($document, $tag, $attributes)
403         {
404                 if (!is_array($attributes))
405                 {
406                         $attributes = array($attributes);
407                 }
409                 if (!is_array($this->strip_htmltags) || !in_array($tag, $this->strip_htmltags))
410                 {
411                         $elements = $document->getElementsByTagName($tag);
412                         foreach ($elements as $element)
413                         {
414                                 foreach ($attributes as $attribute)
415                                 {
416                                         if ($element->hasAttribute($attribute))
417                                         {
418                                                 $value = $this->registry->call('Misc', 'absolutize_url', array($element->getAttribute($attribute), $this->base));
419                                                 if ($value !== false)
420                                                 {
421                                                         $element->setAttribute($attribute, $value);
422                                                 }
423                                         }
424                                 }
425                         }
426                 }
427         }
429         public function do_strip_htmltags($match)
430         {
431                 if ($this->encode_instead_of_strip)
432                 {
433                         if (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
434                         {
435                                 $match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
436                                 $match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
437                                 return "&lt;$match[1]$match[2]&gt;$match[3]&lt;/$match[1]&gt;";
438                         }
439                         else
440                         {
441                                 return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
442                         }
443                 }
444                 elseif (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
445                 {
446                         return $match[4];
447                 }
448                 else
449                 {
450                         return '';
451                 }
452         }
454         protected function strip_tag($tag, $document, $type)
455         {
456                 $xpath = new DOMXPath($document);
457                 $elements = $xpath->query('body//' . $tag);
458                 if ($this->encode_instead_of_strip)
459                 {
460                         foreach ($elements as $element)
461                         {
462                                 $fragment = $document->createDocumentFragment();
464                                 // For elements which aren't script or style, include the tag itself
465                                 if (!in_array($tag, array('script', 'style')))
466                                 {
467                                         $text = '<' . $tag;
468                                         if ($element->hasAttributes())
469                                         {
470                                                 $attrs = array();
471                                                 foreach ($element->attributes as $name => $attr)
472                                                 {
473                                                         $value = $attr->value;
475                                                         // In XHTML, empty values should never exist, so we repeat the value
476                                                         if (empty($value) && ($type & SIMPLEPIE_CONSTRUCT_XHTML))
477                                                         {
478                                                                 $value = $name;
479                                                         }
480                                                         // For HTML, empty is fine
481                                                         elseif (empty($value) && ($type & SIMPLEPIE_CONSTRUCT_HTML))
482                                                         {
483                                                                 $attrs[] = $name;
484                                                                 continue;
485                                                         }
487                                                         // Standard attribute text
488                                                         $attrs[] = $name . '="' . $attr->value . '"';
489                                                 }
490                                                 $text .= ' ' . implode(' ', $attrs);
491                                         }
492                                         $text .= '>';
493                                         $fragment->appendChild(new DOMText($text));
494                                 }
496                                 $number = $element->childNodes->length;
497                                 for ($i = $number; $i > 0; $i--)
498                                 {
499                                         $child = $element->childNodes->item(0);
500                                         $fragment->appendChild($child);
501                                 }
503                                 if (!in_array($tag, array('script', 'style')))
504                                 {
505                                         $fragment->appendChild(new DOMText('</' . $tag . '>'));
506                                 }
508                                 $element->parentNode->replaceChild($fragment, $element);
509                         }
511                         return;
512                 }
513                 elseif (in_array($tag, array('script', 'style')))
514                 {
515                         foreach ($elements as $element)
516                         {
517                                 $element->parentNode->removeChild($element);
518                         }
520                         return;
521                 }
522                 else
523                 {
524                         foreach ($elements as $element)
525                         {
526                                 $fragment = $document->createDocumentFragment();
527                                 $number = $element->childNodes->length;
528                                 for ($i = $number; $i > 0; $i--)
529                                 {
530                                         $child = $element->childNodes->item(0);
531                                         $fragment->appendChild($child);
532                                 }
534                                 $element->parentNode->replaceChild($fragment, $element);
535                         }
536                 }
537         }
539         protected function strip_attr($attrib, $document)
540         {
541                 $xpath = new DOMXPath($document);
542                 $elements = $xpath->query('//*[@' . $attrib . ']');
544                 foreach ($elements as $element)
545                 {
546                         $element->removeAttribute($attrib);
547                 }
548         }