MDL-36212 rework html entity conversions
author Petr Škoda <commits@skodak.org>
Sun, 9 Dec 2012 13:23:59 +0000 (14:23 +0100)
committer Petr Škoda <commits@skodak.org>
Thu, 20 Dec 2012 21:49:20 +0000 (22:49 +0100)
This should resolve all html entity conversion problems in different PHP versions.

lib/editor/tinymce/tiny_mce/3.5.1.1/plugins/spellchecker/classes/GoogleSpell.php
lib/tests/textlib_test.php
lib/textlib.class.php
lib/weblib.php

index e3acf2d..982891f 100644 (file)
@@ -126,6 +126,7 @@ class GoogleSpell extends SpellChecker {
        }\r
 \r
        function _unhtmlentities($string) {\r
+        return textlib::entities_to_utf8($string); // Moodle hack\r
                $string = preg_replace('~&#x([0-9a-f]+);~ei', 'chr(hexdec("\\1"))', $string);\r
                $string = preg_replace('~&#([0-9]+);~e', 'chr(\\1)', $string);\r
 \r
index f071531..7a6e843 100644 (file)
@@ -293,8 +293,8 @@ class core_textlib_testcase extends basic_testcase {
      * @return void
      */
     public function test_entities_to_utf8() {
-        $str = "&#x17d;lu&#x165;ou&#x10d;k&#xfd; kon&#237;&#269;ek";
-        $this->assertSame(textlib::entities_to_utf8($str), "Žluťoučký koníček");
+        $str = "&#x17d;lu&#x165;ou&#x10d;k&#xfd; kon&iacute;&#269;ek&copy;&quot;&amp;&lt;&gt;&sect;&laquo;";
+        $this->assertSame("Žluťoučký koníček©\"&<>§«", textlib::entities_to_utf8($str));
     }
 
     /**
@@ -302,10 +302,13 @@ class core_textlib_testcase extends basic_testcase {
      * @return void
      */
     public function test_utf8_to_entities() {
-        $str = "Žluťoučký koníček";
-        $this->assertSame(textlib::utf8_to_entities($str), "&#x17d;lu&#x165;ou&#x10d;k&#xfd; kon&#xed;&#x10d;ek");
-        $this->assertSame(textlib::utf8_to_entities($str, true), "&#381;lu&#357;ou&#269;k&#253; kon&#237;&#269;ek");
+        $str = "&#x17d;luťoučký kon&iacute;ček&copy;&quot;&amp;&lt;&gt;&sect;&laquo;";
+        $this->assertSame("&#x17d;lu&#x165;ou&#x10d;k&#xfd; kon&iacute;&#x10d;ek&copy;&quot;&amp;&lt;&gt;&sect;&laquo;", textlib::utf8_to_entities($str));
+        $this->assertSame("&#381;lu&#357;ou&#269;k&#253; kon&iacute;&#269;ek&copy;&quot;&amp;&lt;&gt;&sect;&laquo;", textlib::utf8_to_entities($str, true));
 
+        $str = "&#381;luťoučký kon&iacute;ček&copy;&quot;&amp;&lt;&gt;&sect;&laquo;";
+        $this->assertSame("&#x17d;lu&#x165;ou&#x10d;k&#xfd; kon&#xed;&#x10d;ek&#xa9;\"&<>&#xa7;&#xab;", textlib::utf8_to_entities($str, false, true));
+        $this->assertSame("&#381;lu&#357;ou&#269;k&#253; kon&#237;&#269;ek&#169;\"&<>&#167;&#171;", textlib::utf8_to_entities($str, true, true));
     }
 
     /**
index ab0db3c..bddcaa7 100644 (file)
@@ -441,6 +441,34 @@ class textlib {
         return $encoded;
     }
 
+    /**
+     * Returns HTML entity transliteration table.
+     * @return array with (html entity => utf-8) elements
+     */
+    protected static function get_entities_table() {
+        static $trans_tbl = null;
+
+        // Generate/create $trans_tbl
+        if (!isset($trans_tbl)) {
+            if (version_compare(phpversion(), '5.3.4') < 0) {
+                $trans_tbl = array();
+                foreach (get_html_translation_table(HTML_ENTITIES) as $val=>$key) {
+                    $trans_tbl[$key] = textlib::convert($val, 'ISO-8859-1', 'utf-8');
+                }
+
+            } else if (version_compare(phpversion(), '5.4.0') < 0) {
+                $trans_tbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
+                $trans_tbl = array_flip($trans_tbl);
+
+            } else {
+                $trans_tbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8');
+                $trans_tbl = array_flip($trans_tbl);
+            }
+        }
+
+        return $trans_tbl;
+    }
+
     /**
      * Converts all the numeric entities &#nnnn; or &#xnnn; to UTF-8
      * Original from laurynas dot butkus at gmail at:
@@ -450,28 +478,24 @@ class textlib {
      * @param string $str input string
      * @param boolean $htmlent convert also html entities (defaults to true)
      * @return string encoded UTF-8 string
-     *
-     * NOTE: we could have used typo3 entities_to_utf8() here
-     *       but the direct alternative used runs 400% quicker
-     *       and uses 0.5Mb less memory, so, let's use it
-     *       (tested against 10^6 conversions)
      */
     public static function entities_to_utf8($str, $htmlent=true) {
-        static $trans_tbl; // Going to use static transliteration table
+        static $callback1 = null ;
+        static $callback2 = null ;
+
+        if (!$callback1 or !$callback2) {
+            $callback1 = create_function('$matches', 'return textlib::code2utf8(hexdec($matches[1]));');
+            $callback2 = create_function('$matches', 'return textlib::code2utf8($matches[1]);');
+        }
 
-        // Replace numeric entities
-        $result = preg_replace('~&#x([0-9a-f]+);~ei', 'textlib::code2utf8(hexdec("\\1"))', $str);
-        $result = preg_replace('~&#([0-9]+);~e', 'textlib::code2utf8(\\1)', $result);
+        $result = (string)$str;
+        $result = preg_replace_callback('/&#x([0-9a-f]+);/i', $callback1, $result);
+        $result = preg_replace_callback('/&#([0-9]+);/', $callback2, $result);
 
         // Replace literal entities (if desired)
         if ($htmlent) {
-            // Generate/create $trans_tbl
-            if (!isset($trans_tbl)) {
-                $trans_tbl = array();
-                foreach (get_html_translation_table(HTML_ENTITIES) as $val=>$key) {
-                    $trans_tbl[$key] = utf8_encode($val);
-                }
-            }
+            $trans_tbl = self::get_entities_table();
+            // It should be safe to search for ascii strings and replace them with utf-8 here.
             $result = strtr($result, $trans_tbl);
         }
         // Return utf8-ised string
@@ -487,17 +511,24 @@ class textlib {
      * @return string converted string
      */
     public static function utf8_to_entities($str, $dec=false, $nonnum=false) {
-        // Avoid some notices from Typo3 code
-        $oldlevel = error_reporting(E_PARSE);
+        static $callback = null ;
+
         if ($nonnum) {
-            $str = self::typo3()->entities_to_utf8((string)$str, true);
+            $str = self::entities_to_utf8($str, true);
         }
+
+        // Avoid some notices from Typo3 code
+        $oldlevel = error_reporting(E_PARSE);
         $result = self::typo3()->utf8_to_entities((string)$str);
+        error_reporting($oldlevel);
+
         if ($dec) {
-            $result = preg_replace('/&#x([0-9a-f]+);/ie', "'&#'.hexdec('$1').';'", $result);
+            if (!$callback) {
+                $callback = create_function('$matches', 'return \'&#\'.(hexdec($matches[1])).\';\';');
+            }
+            $result = preg_replace_callback('/&#x([0-9a-f]+);/i', $callback, $result);
         }
-        // Restore original debug level
-        error_reporting($oldlevel);
+
         return $result;
     }
 
index 84362fe..500467b 100644 (file)
@@ -1382,7 +1382,7 @@ function format_text_email($text, $format) {
         case FORMAT_WIKI:
             // there should not be any of these any more!
             $text = wikify_links($text);
-            return strtr(strip_tags($text), array_flip(get_html_translation_table(HTML_ENTITIES)));
+            return textlib::entities_to_utf8(strip_tags($text), true);
             break;
 
         case FORMAT_HTML:
@@ -1393,7 +1393,7 @@ function format_text_email($text, $format) {
         case FORMAT_MARKDOWN:
         default:
             $text = wikify_links($text);
-            return strtr(strip_tags($text), array_flip(get_html_translation_table(HTML_ENTITIES)));
+            return textlib::entities_to_utf8(strip_tags($text), true);
             break;
     }
 }