From ea0f9a277c1f67f6e4ed8f21487aacf115291651 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Petr=20S=CC=8Ckoda?= Date: Sun, 9 Dec 2012 14:23:59 +0100 Subject: [PATCH] MDL-36212 rework html entity conversions This should resolve all html entity conversion problems in different PHP versions. --- .../spellchecker/classes/GoogleSpell.php | 1 + lib/tests/textlib_test.php | 13 ++-- lib/textlib.class.php | 75 +++++++++++++------ lib/weblib.php | 4 +- 4 files changed, 64 insertions(+), 29 deletions(-) diff --git a/lib/editor/tinymce/tiny_mce/3.5.1.1/plugins/spellchecker/classes/GoogleSpell.php b/lib/editor/tinymce/tiny_mce/3.5.1.1/plugins/spellchecker/classes/GoogleSpell.php index e3acf2d878c..982891f65a6 100644 --- a/lib/editor/tinymce/tiny_mce/3.5.1.1/plugins/spellchecker/classes/GoogleSpell.php +++ b/lib/editor/tinymce/tiny_mce/3.5.1.1/plugins/spellchecker/classes/GoogleSpell.php @@ -126,6 +126,7 @@ class GoogleSpell extends SpellChecker { } function _unhtmlentities($string) { + return textlib::entities_to_utf8($string); // Moodle hack $string = preg_replace('~&#x([0-9a-f]+);~ei', 'chr(hexdec("\\1"))', $string); $string = preg_replace('~&#([0-9]+);~e', 'chr(\\1)', $string); diff --git a/lib/tests/textlib_test.php b/lib/tests/textlib_test.php index f0715313683..7a6e84382dc 100644 --- a/lib/tests/textlib_test.php +++ b/lib/tests/textlib_test.php @@ -293,8 +293,8 @@ class core_textlib_testcase extends basic_testcase { * @return void */ public function test_entities_to_utf8() { - $str = "Žluťoučký koníček"; - $this->assertSame(textlib::entities_to_utf8($str), "ŽluÅ¥oučký koníček"); + $str = "Žluťoučký koníček©"&<>§«"; + $this->assertSame("ŽluÅ¥oučký koníček©\"&<>§«", textlib::entities_to_utf8($str)); } /** @@ -302,10 +302,13 @@ class core_textlib_testcase extends basic_testcase { * @return void */ public function test_utf8_to_entities() { - $str = "ŽluÅ¥oučký koníček"; - $this->assertSame(textlib::utf8_to_entities($str), "Žluťoučký koníček"); - $this->assertSame(textlib::utf8_to_entities($str, true), "Žluťoučký koníček"); + $str = "ŽluÅ¥oučký koníček©"&<>§«"; + $this->assertSame("Žluťoučký koníček©"&<>§«", textlib::utf8_to_entities($str)); + $this->assertSame("Žluťoučký koníček©"&<>§«", textlib::utf8_to_entities($str, true)); + $str = "ŽluÅ¥oučký koníček©"&<>§«"; + $this->assertSame("Žluťoučký koníček©\"&<>§«", textlib::utf8_to_entities($str, false, true)); + $this->assertSame("Žluťoučký koníček©\"&<>§«", textlib::utf8_to_entities($str, true, true)); } /** diff --git a/lib/textlib.class.php b/lib/textlib.class.php index ab0db3c4780..bddcaa721d6 100644 --- a/lib/textlib.class.php +++ b/lib/textlib.class.php @@ -441,6 +441,34 @@ class textlib { return $encoded; } + /** + * Returns HTML entity transliteration table. + * @return array with (html entity => utf-8) elements + */ + protected static function get_entities_table() { + static $trans_tbl = null; + + // Generate/create $trans_tbl + if (!isset($trans_tbl)) { + if (version_compare(phpversion(), '5.3.4') < 0) { + $trans_tbl = array(); + foreach (get_html_translation_table(HTML_ENTITIES) as $val=>$key) { + $trans_tbl[$key] = textlib::convert($val, 'ISO-8859-1', 'utf-8'); + } + + } else if (version_compare(phpversion(), '5.4.0') < 0) { + $trans_tbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'); + $trans_tbl = array_flip($trans_tbl); + + } else { + $trans_tbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8'); + $trans_tbl = array_flip($trans_tbl); + } + } + + return $trans_tbl; + } + /** * Converts all the numeric entities &#nnnn; or &#xnnn; to UTF-8 * Original from laurynas dot butkus at gmail at: @@ -450,28 +478,24 @@ class textlib { * @param string $str input string * @param boolean $htmlent convert also html entities (defaults to true) * @return string encoded UTF-8 string - * - * NOTE: we could have used typo3 entities_to_utf8() here - * but the direct alternative used runs 400% quicker - * and uses 0.5Mb less memory, so, let's use it - * (tested against 10^6 conversions) */ public static function entities_to_utf8($str, $htmlent=true) { - static $trans_tbl; // Going to use static transliteration table + static $callback1 = null ; + static $callback2 = null ; + + if (!$callback1 or !$callback2) { + $callback1 = create_function('$matches', 'return textlib::code2utf8(hexdec($matches[1]));'); + $callback2 = create_function('$matches', 'return textlib::code2utf8($matches[1]);'); + } - // Replace numeric entities - $result = preg_replace('~&#x([0-9a-f]+);~ei', 'textlib::code2utf8(hexdec("\\1"))', $str); - $result = preg_replace('~&#([0-9]+);~e', 'textlib::code2utf8(\\1)', $result); + $result = (string)$str; + $result = preg_replace_callback('/&#x([0-9a-f]+);/i', $callback1, $result); + $result = preg_replace_callback('/&#([0-9]+);/', $callback2, $result); // Replace literal entities (if desired) if ($htmlent) { - // Generate/create $trans_tbl - if (!isset($trans_tbl)) { - $trans_tbl = array(); - foreach (get_html_translation_table(HTML_ENTITIES) as $val=>$key) { - $trans_tbl[$key] = utf8_encode($val); - } - } + $trans_tbl = self::get_entities_table(); + // It should be safe to search for ascii strings and replace them with utf-8 here. $result = strtr($result, $trans_tbl); } // Return utf8-ised string @@ -487,17 +511,24 @@ class textlib { * @return string converted string */ public static function utf8_to_entities($str, $dec=false, $nonnum=false) { - // Avoid some notices from Typo3 code - $oldlevel = error_reporting(E_PARSE); + static $callback = null ; + if ($nonnum) { - $str = self::typo3()->entities_to_utf8((string)$str, true); + $str = self::entities_to_utf8($str, true); } + + // Avoid some notices from Typo3 code + $oldlevel = error_reporting(E_PARSE); $result = self::typo3()->utf8_to_entities((string)$str); + error_reporting($oldlevel); + if ($dec) { - $result = preg_replace('/&#x([0-9a-f]+);/ie', "'&#'.hexdec('$1').';'", $result); + if (!$callback) { + $callback = create_function('$matches', 'return \'&#\'.(hexdec($matches[1])).\';\';'); + } + $result = preg_replace_callback('/&#x([0-9a-f]+);/i', $callback, $result); } - // Restore original debug level - error_reporting($oldlevel); + return $result; } diff --git a/lib/weblib.php b/lib/weblib.php index 84362feaa04..500467bef06 100644 --- a/lib/weblib.php +++ b/lib/weblib.php @@ -1382,7 +1382,7 @@ function format_text_email($text, $format) { case FORMAT_WIKI: // there should not be any of these any more! $text = wikify_links($text); - return strtr(strip_tags($text), array_flip(get_html_translation_table(HTML_ENTITIES))); + return textlib::entities_to_utf8(strip_tags($text), true); break; case FORMAT_HTML: @@ -1393,7 +1393,7 @@ function format_text_email($text, $format) { case FORMAT_MARKDOWN: default: $text = wikify_links($text); - return strtr(strip_tags($text), array_flip(get_html_translation_table(HTML_ENTITIES))); + return textlib::entities_to_utf8(strip_tags($text), true); break; } } -- 2.43.0