5 * A PHP-Based RSS and Atom Feed Framework.
6 * Takes the hard work out of managing a complete RSS/Atom solution.
8 * Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
11 * Redistribution and use in source and binary forms, with or without modification, are
12 * permitted provided that the following conditions are met:
14 * * Redistributions of source code must retain the above copyright notice, this list of
15 * conditions and the following disclaimer.
17 * * Redistributions in binary form must reproduce the above copyright notice, this list
18 * of conditions and the following disclaimer in the documentation and/or other materials
19 * provided with the distribution.
21 * * Neither the name of the SimplePie Team nor the names of its contributors may be used
22 * to endorse or promote products derived from this software without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
26 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
28 * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
36 * @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
38 * @author Geoffrey Sneddon
40 * @link http://simplepie.org/ SimplePie
41 * @license http://www.opensource.org/licenses/bsd-license.php BSD License
45 * Used for data cleanup and post-processing
48 * This class can be overloaded with {@see SimplePie::set_sanitize_class()}
51 * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
53 class SimplePie_Sanitize
// Base URL that replace_urls() resolves relative attribute values against.
public $base;

// Registry injected through set_registry(); used to reach the Misc, Cache and File helpers.
public $registry;

// Options

// When true, sanitize() removes the wrapping <div> from serialised markup.
public $remove_div = true;

// Prefix prepended to the cache key when <img src> is rewritten; '' or false disables image handling.
public $image_handler = '';

// Tag names handed to strip_tag() one by one; false disables tag stripping.
public $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');

// When true, stripped tags are kept as visible text instead of being dropped.
public $encode_instead_of_strip = false;

// Attribute names removed from every element; false disables attribute stripping.
public $strip_attributes = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');

// Map of tag name => (attribute => value) pairs forced onto matching elements.
public $add_attributes = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none'));

// When true, comment nodes are removed from markup.
public $strip_comments = false;

// Target encoding of sanitize()'s return value; conversion is skipped for 'UTF-8'.
public $output_encoding = 'UTF-8';

// Image caching settings used by sanitize().
public $enable_cache = true;
public $cache_location = './cache';
public $cache_name_function = 'md5';

// Settings forwarded to the File object that downloads images to cache.
// Defaults mirror the parameter defaults of pass_file_data().
public $timeout = 10;
public $useragent = '';
public $force_fsockopen = false;

// Map of element name => attribute name(s) whose values are URLs to absolutize.
public $replace_url_attributes = null;
/**
 * Create a sanitizer pre-loaded with the default URL-bearing
 * element/attribute map.
 */
public function __construct()
{
    // No argument means null, which set_url_replacements() treats as "use the defaults".
    $this->set_url_replacements();
}
/**
 * Choose whether sanitize() removes the wrapping <div> from serialised markup.
 *
 * @param mixed $enable Any value; only its truthiness is stored
 */
public function remove_div($enable = true)
{
    $this->remove_div = $enable ? true : false;
}
/**
 * Set the URL prefix used when <img src> attributes are rewritten to point
 * at cached copies (the cache key is appended to this prefix).
 *
 * @param string|false $page Handler prefix; any falsy value disables image handling
 */
public function set_image_handler($page = false)
{
    if ($page) {
        $this->image_handler = (string) $page;
    } else {
        // sanitize() casts the handler to string and compares against '',
        // so false switches image rewriting off.
        $this->image_handler = false;
    }
}
/**
 * Inject the registry through which this class calls the Misc, Cache and
 * File helpers.
 *
 * @param SimplePie_Registry $registry Registry instance to use from now on
 */
public function set_registry(SimplePie_Registry $registry)
{
    $this->registry = $registry;
}
/**
 * Receive the cache configuration used for image caching.
 *
 * Falsy location / name-function values leave the current setting untouched.
 *
 * @param mixed  $enable_cache        Stored as bool; ignored only when null
 * @param string $cache_location      Location passed to the cache handler
 * @param string $cache_name_function Callable name that turns an image URL into a cache key
 * @param string $cache_class         Accepted for signature compatibility; not used by this method
 */
public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
{
    // isset() is false only for null, so an explicit false still disables caching.
    if (isset($enable_cache)) {
        $this->enable_cache = (bool) $enable_cache;
    }

    if ($cache_location) {
        $this->cache_location = (string) $cache_location;
    }

    if ($cache_name_function) {
        $this->cache_name_function = (string) $cache_name_function;
    }
}
/**
 * Receive the HTTP settings used when images are downloaded for caching.
 *
 * Each setting is only overwritten by a truthy argument, so a falsy value
 * keeps whatever is currently configured.
 *
 * @param string $file_class      Accepted for signature compatibility; not used by this method
 * @param int    $timeout         Request timeout forwarded to the File object
 * @param string $useragent       User agent forwarded to the File object
 * @param bool   $force_fsockopen Forwarded to the File object as its last constructor argument
 */
public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
{
    // NOTE(review): timeout and force_fsockopen are stored as strings, as before;
    // confirm the File class does not need int/bool before tightening the casts.
    if ($timeout) {
        $this->timeout = (string) $timeout;
    }

    if ($useragent) {
        $this->useragent = (string) $useragent;
    }

    if ($force_fsockopen) {
        $this->force_fsockopen = (string) $force_fsockopen;
    }
}
/**
 * Configure which tags sanitize() strips from markup.
 *
 * @param array|string|false $tags Array of tag names, a comma-separated string
 *                                 of tag names, or a falsy value to disable stripping
 */
public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
{
    if (!$tags) {
        $this->strip_htmltags = false;
        return;
    }

    // The string form is split on commas verbatim (no trimming of the pieces).
    $this->strip_htmltags = is_array($tags) ? $tags : explode(',', $tags);
}
/**
 * Choose whether stripped tags are kept as visible text rather than removed.
 *
 * @param mixed $encode Any value; only its truthiness is stored
 */
public function encode_instead_of_strip($encode = false)
{
    $this->encode_instead_of_strip = $encode ? true : false;
}
/**
 * Configure which attributes sanitize() removes from every element.
 *
 * @param array|string|false $attribs Array of attribute names, a comma-separated
 *                                    string of names, or a falsy value to disable stripping
 */
public function strip_attributes($attribs = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
{
    if (!$attribs) {
        $this->strip_attributes = false;
        return;
    }

    // The string form is split on commas verbatim (no trimming of the pieces).
    $this->strip_attributes = is_array($attribs) ? $attribs : explode(',', $attribs);
}
/**
 * Configure attributes that sanitize() forces onto specific elements.
 *
 * @param array|string|false $attribs Map of tag name => (attribute => value) pairs,
 *                                    or a falsy value to disable the feature
 */
public function add_attributes($attribs = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none')))
{
    if (!$attribs) {
        $this->add_attributes = false;
        return;
    }

    // NOTE(review): the comma-separated string form is kept for compatibility, but
    // it yields a flat list, not the tag => pairs map that add_attr() iterates over.
    $this->add_attributes = is_array($attribs) ? $attribs : explode(',', $attribs);
}
/**
 * Choose whether sanitize() removes comment nodes from markup.
 *
 * @param mixed $strip Any value; only its truthiness is stored
 */
public function strip_comments($strip = false)
{
    $this->strip_comments = $strip ? true : false;
}
/**
 * Set the encoding that sanitize() converts its result to.
 *
 * @param string $encoding Encoding name; 'UTF-8' means no conversion is performed
 */
public function set_output_encoding($encoding = 'UTF-8')
{
    $this->output_encoding = strval($encoding);
}
/**
 * Set element/attribute key/value pairs of HTML attributes
 * containing URLs that need to be resolved relative to the feed
 *
 * Defaults to |a|@href, |area|@href, |blockquote|@cite, |del|@cite,
 * |form|@action, |img|@longdesc, |img|@src, |input|@src, |ins|@cite,
 * |q|@cite
 *
 * @param array|null $element_attribute Element/attribute key/value pairs, null for default
 */
public function set_url_replacements($element_attribute = null)
{
    if ($element_attribute === null) {
        // A value may be a single attribute name or a list of names;
        // replace_urls() accepts both forms.
        $element_attribute = array(
            'a' => 'href',
            'area' => 'href',
            'blockquote' => 'cite',
            'del' => 'cite',
            'form' => 'action',
            'img' => array(
                'longdesc',
                'src'
            ),
            'input' => 'src',
            'ins' => 'cite',
            'q' => 'cite'
        );
    }

    $this->replace_url_attributes = (array) $element_attribute;
}
/**
 * Clean a piece of feed data according to its construct type.
 *
 * Processing steps, in order:
 *  - a MAYBE_HTML construct is classified as HTML or TEXT by sniffing for
 *    entities / closing tags;
 *  - BASE64 constructs are decoded;
 *  - HTML/XHTML is parsed with DOMDocument, then comments, tags and attributes
 *    are stripped, attributes are added, URLs are absolutized and images are
 *    optionally cached, all according to the configured options;
 *  - IRI constructs are resolved against $base;
 *  - TEXT and IRI constructs are HTML-escaped;
 *  - the result is converted to the configured output encoding.
 *
 * @param string $data Raw value taken from the feed
 * @param int    $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags describing $data
 * @param string $base Base URL against which relative URLs are resolved
 * @return string The sanitized value (the trimmed input when there is nothing to do)
 * @throws SimplePie_Exception When markup must be processed but DOMDocument is unavailable
 */
public function sanitize($data, $type, $base = '')
{
    $data = trim($data);

    // Empty data only needs work when it is an IRI (it may resolve to the base).
    if ($data === '' && !($type & SIMPLEPIE_CONSTRUCT_IRI)) {
        return $data;
    }

    if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML) {
        // An entity or a closing tag is taken as evidence of HTML content.
        if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data)) {
            $type |= SIMPLEPIE_CONSTRUCT_HTML;
        } else {
            $type |= SIMPLEPIE_CONSTRUCT_TEXT;
        }
    }

    if ($type & SIMPLEPIE_CONSTRUCT_BASE64) {
        $data = base64_decode($data);
    }

    if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML)) {
        if (!class_exists('DOMDocument')) {
            throw new SimplePie_Exception('DOMDocument not found, unable to use sanitizer');
        }
        $document = new DOMDocument();
        $document->encoding = 'UTF-8';

        // Surround the payload with a unique marker so it can be cut back out
        // of the serialised document afterwards.
        // See https://github.com/simplepie/simplepie/issues/334
        $unique_tag = '#' . uniqid() . '#';
        $data = trim($unique_tag . $data . $unique_tag);

        $data = $this->preprocess($data, $type);

        // Parser warnings about malformed feed markup are silenced.
        set_error_handler(array('SimplePie_Misc', 'silence_errors'));
        $document->loadHTML($data);
        restore_error_handler();

        $xpath = new DOMXPath($document);

        if ($this->strip_comments) {
            foreach ($xpath->query('//comment()') as $comment) {
                $comment->parentNode->removeChild($comment);
            }
        }

        // Strip out HTML tags and attributes that might cause various security problems.
        // Based on recommendations by Mark Pilgrim at:
        // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
        if ($this->strip_htmltags) {
            foreach ($this->strip_htmltags as $tag) {
                $this->strip_tag($tag, $document, $xpath, $type);
            }
        }

        if ($this->strip_attributes) {
            foreach ($this->strip_attributes as $attrib) {
                $this->strip_attr($attrib, $xpath);
            }
        }

        if ($this->add_attributes) {
            foreach ($this->add_attributes as $tag => $valuePairs) {
                $this->add_attr($tag, $valuePairs, $document);
            }
        }

        // Replace relative URLs. replace_urls() reads the base from $this->base,
        // so it has to be published before the loop.
        $this->base = $base;
        foreach ($this->replace_url_attributes as $element => $attributes) {
            $this->replace_urls($document, $element, $attributes);
        }

        // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
        if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache) {
            $this->cache_images($document);
        }

        // Finally, convert to a HTML string and cut the payload back out.
        $data = trim($document->saveHTML());
        $result = explode($unique_tag, $data);
        // The tags may not be found again if there was invalid markup.
        $data = count($result) === 3 ? $result[1] : '';

        if ($this->remove_div) {
            $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
            $data = preg_replace('/<\/div>$/', '', $data);
        } else {
            // Keep the wrapper but drop whatever attributes it carried.
            $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
        }
    }

    if ($type & SIMPLEPIE_CONSTRUCT_IRI) {
        $absolute = $this->registry->call('Misc', 'absolutize_url', array($data, $base));
        if ($absolute !== false) {
            $data = $absolute;
        }
    }

    if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI)) {
        $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
    }

    if ($this->output_encoding !== 'UTF-8') {
        $data = $this->registry->call('Misc', 'change_encoding', array($data, 'UTF-8', $this->output_encoding));
    }

    return $data;
}

/**
 * Cache every <img> of the document and point its src at the image handler.
 *
 * Images already present in the cache are rewritten directly; otherwise the
 * image is downloaded and, if it can be stored, rewritten afterwards.
 *
 * @param DOMDocument $document Document whose <img> elements are processed in place
 */
protected function cache_images($document)
{
    // Only forward the client address when there is one (absent under CLI).
    $headers = isset($_SERVER['REMOTE_ADDR'])
        ? array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR'])
        : array();

    foreach ($document->getElementsByTagName('img') as $img) {
        if (!$img->hasAttribute('src')) {
            continue;
        }

        $image_url = call_user_func($this->cache_name_function, $img->getAttribute('src'));
        $cache = $this->registry->call('Cache', 'get_handler', array($this->cache_location, $image_url, 'spi'));

        if ($cache->load()) {
            $img->setAttribute('src', $this->image_handler . $image_url);
            continue;
        }

        $file = $this->registry->create('File', array($img->getAttribute('src'), $this->timeout, 5, $headers, $this->useragent, $this->force_fsockopen));

        // The parentheses matter: `===` binds tighter than `&`, so without them
        // the "not a remote source" test could never be true.
        $is_local = ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE) === 0;
        $status_ok = $file->status_code === 200 || ($file->status_code > 206 && $file->status_code < 300);
        if (!$file->success || !($is_local || $status_ok)) {
            continue;
        }

        if ($cache->save(array('headers' => $file->headers, 'body' => $file->body))) {
            $img->setAttribute('src', $this->image_handler . $image_url);
        } else {
            trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
        }
    }
}
/**
 * Wrap a markup fragment in a complete document that DOMDocument can load.
 *
 * Any <html>/<body> tags in the fragment are removed first, then the fragment
 * is embedded in a fresh document that declares UTF-8.
 *
 * @param string $html Markup fragment
 * @param int    $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
 * @return string Complete document source
 */
protected function preprocess($html, $type)
{
    $ret = '';

    // The pattern is split so that the sequence "?" + ">" never appears in this file.
    $html = preg_replace('%</?(?:html|body)[^>]*?'.'>%is', '', $html);

    // True whenever any flag other than XHTML is set.
    if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML) {
        // Atom XHTML constructs are wrapped with a div by default
        // Note: No protection if $html contains a stray </div>!
        $html = '<div>' . $html . '</div>';
        $ret .= '<!DOCTYPE html>';
        $content_type = 'text/html';
    } else {
        $ret .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
        $content_type = 'application/xhtml+xml';
    }

    $ret .= '<html><head>';
    $ret .= '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />';
    $ret .= '</head><body>' . $html . '</body></html>';

    return $ret;
}
/**
 * Absolutize the URL-bearing attributes of every element with a given tag name.
 *
 * Each attribute value is resolved against $this->base; values that cannot be
 * resolved are left as they are. Tags configured to be stripped are skipped.
 *
 * @param DOMDocument  $document   Document to modify in place
 * @param string       $tag        Tag name to look for
 * @param string|array $attributes Attribute name, or list of attribute names
 */
public function replace_urls($document, $tag, $attributes)
{
    $attributes = is_array($attributes) ? $attributes : array($attributes);

    if (is_array($this->strip_htmltags) && in_array($tag, $this->strip_htmltags)) {
        return;
    }

    foreach ($document->getElementsByTagName($tag) as $node) {
        foreach ($attributes as $attribute) {
            if (!$node->hasAttribute($attribute)) {
                continue;
            }

            $resolved = $this->registry->call('Misc', 'absolutize_url', array($node->getAttribute($attribute), $this->base));
            if ($resolved !== false) {
                $node->setAttribute($attribute, $resolved);
            }
        }
    }
}
/**
 * Replacement callback producing the text that takes the place of a matched tag.
 *
 * The layout of $match is defined by the caller's regular expression, which is
 * not part of this class. NOTE(review): presumably 1 = tag name, 2 = attribute
 * text, 3/4 = inner content of a paired tag - verify against the caller.
 *
 * In encode mode the tag delimiters are HTML-escaped so the tag is shown as
 * text and never reaches the output as live markup; otherwise the tag is
 * dropped and only the inner content (if any) is kept. script and style never
 * keep their content outside encode mode.
 *
 * @param array $match Regex match groups
 * @return string Replacement text
 */
public function do_strip_htmltags($match)
{
    $keeps_content = isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style'));

    if ($this->encode_instead_of_strip) {
        if ($keeps_content) {
            // Escape the delimiters together with name and attributes; emitting
            // literal angle brackets here would let the tag through untouched.
            $open = htmlspecialchars('<' . $match[1] . $match[2] . '>', ENT_COMPAT, 'UTF-8');
            $close = htmlspecialchars('</' . $match[1] . '>', ENT_COMPAT, 'UTF-8');
            return $open . $match[3] . $close;
        }

        return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
    }

    if ($keeps_content) {
        return $match[4];
    }

    return '';
}
/**
 * Remove every occurrence of a tag below <body>.
 *
 * - Encode mode: the element is replaced by its children; for anything but
 *   script/style the opening and closing tag are added around them as text.
 * - Normal mode, script/style: the element is removed together with its content.
 * - Normal mode, other tags: the element is replaced by its children.
 *
 * @param string      $tag      Tag name to strip
 * @param DOMDocument $document Document being sanitized (modified in place)
 * @param DOMXPath    $xpath    XPath evaluator bound to $document
 * @param int         $type     Bitmask of SIMPLEPIE_CONSTRUCT_* flags
 */
protected function strip_tag($tag, $document, $xpath, $type)
{
    $elements = $xpath->query('body//' . $tag);
    $is_script_or_style = in_array($tag, array('script', 'style'));

    if (!$this->encode_instead_of_strip && $is_script_or_style) {
        foreach ($elements as $element) {
            $element->parentNode->removeChild($element);
        }
        return;
    }

    // For elements which aren't script or style, encode mode includes the tag itself.
    $show_tag = $this->encode_instead_of_strip && !$is_script_or_style;

    foreach ($elements as $element) {
        $fragment = $document->createDocumentFragment();

        if ($show_tag) {
            $text = '<' . $tag;
            if ($element->hasAttributes()) {
                $attrs = array();
                foreach ($element->attributes as $name => $attr) {
                    $value = $attr->value;

                    // Compare against '' rather than using empty(), which would
                    // also treat the legitimate value "0" as missing.
                    if ($value === '') {
                        if ($type & SIMPLEPIE_CONSTRUCT_XHTML) {
                            // In XHTML, empty values should never exist, so we repeat the name
                            $value = $name;
                        } elseif ($type & SIMPLEPIE_CONSTRUCT_HTML) {
                            // For HTML, empty is fine
                            $attrs[] = $name;
                            continue;
                        }
                    }

                    // Standard attribute text (uses the possibly substituted value)
                    $attrs[] = $name . '="' . $value . '"';
                }
                $text .= ' ' . implode(' ', $attrs);
            }
            $text .= '>';
            // A text node is escaped on serialisation, so the tag shows up as text.
            $fragment->appendChild(new DOMText($text));
        }

        // Appending a child to the fragment detaches it from the element,
        // so the loop ends once the element is empty.
        while ($element->firstChild !== null) {
            $fragment->appendChild($element->firstChild);
        }

        if ($show_tag) {
            $fragment->appendChild(new DOMText('</' . $tag . '>'));
        }

        $element->parentNode->replaceChild($fragment, $element);
    }
}
/**
 * Remove an attribute from every element that carries it.
 *
 * @param string   $attrib Attribute name; inserted verbatim into the XPath expression
 * @param DOMXPath $xpath  XPath evaluator bound to the document being sanitized
 */
protected function strip_attr($attrib, $xpath)
{
    $carriers = $xpath->query('//*[@' . $attrib . ']');

    foreach ($carriers as $node) {
        $node->removeAttribute($attrib);
    }
}
/**
 * Set a group of attributes on every element with a given tag name,
 * overwriting values that are already present.
 *
 * @param string      $tag        Tag name to look for
 * @param array       $valuePairs Map of attribute name => value
 * @param DOMDocument $document   Document to modify in place
 */
protected function add_attr($tag, $valuePairs, $document)
{
    foreach ($document->getElementsByTagName($tag) as $node) {
        foreach ($valuePairs as $name => $content) {
            $node->setAttribute($name, $content);
        }
    }
}