Merge branch 'MDL-60238-master' of git://github.com/ankitagarwal/moodle
[moodle.git] / lib / html2text / Html2Text.php
CommitLineData
ec2d33df
AN
1<?php
2
3/*
4 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
5 *
6 * This script is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * The GNU General Public License can be found at
12 * http://www.gnu.org/copyleft/gpl.html.
13 *
14 * This script is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 */
19
20namespace Html2Text;
21
/**
 * Converts an HTML document (or fragment) into formatted plain text.
 *
 * Typical use: construct with the HTML and an optional options array, then
 * call getText(). The conversion is performed lazily on the first getText()
 * call and is redone after setHtml() replaces the source.
 */
class Html2Text
{
    const ENCODING = 'UTF-8';

    /**
     * Flags passed to html_entity_decode() and htmlspecialchars().
     *
     * Assigned in the constructor: ENT_COMPAT, plus ENT_HTML5 when running
     * on PHP 5.4 or later.
     *
     * @var int
     */
    protected $htmlFuncFlags;

    /**
     * Contains the HTML content to convert.
     *
     * @var string
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string
     */
    protected $text;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * The patterns are applied in order, so newlines are already collapsed
     * to spaces by the time the tag patterns run.
     *
     * @var array
     * @see $replace
     */
    protected $search = array(
        "/\r/",                                           // Non-legal carriage return
        "/[\n\t]+/",                                      // Newlines and tabs
        '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
        '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
        '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
        '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
        '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
        '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
        '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
        '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
        '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
        '/<li\b[^>]*>/i',                                 // <li>
        '/<hr\b[^>]*>/i',                                 // <hr>
        '/<div\b[^>]*>/i',                                // <div>
        '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
        '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array
     * @see $search
     */
    protected $replace = array(
        '',                              // Non-legal carriage return
        ' ',                             // Newlines and tabs
        '',                              // <head>
        '',                              // <script>s -- which strip_tags supposedly has problems with
        '',                              // <style>s -- which strip_tags supposedly has problems with
        '_\\1_',                         // <i>
        '_\\1_',                         // <em>
        "\n\n",                          // <ul> and </ul>
        "\n\n",                          // <ol> and </ol>
        "\n\n",                          // <dl> and </dl>
        "\t* \\1\n",                     // <li> and </li>
        " \\1\n",                        // <dd> and </dd>
        "\t* \\1",                       // <dt> and </dt>
        "\n\t* ",                        // <li>
        "\n-------------------------\n", // <hr>
        "<div>\n",                       // <div>
        "\n\n",                          // <table> and </table>
        "\n",                            // <tr> and </tr>
        "\t\t\\1\n",                     // <td> and </td>
        "",                              // <span class="_html2text_ignore">...</span>
        '[\\2]',                         // <img> with alt tag
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $entReplace.
     *
     * @var array
     * @see $entReplace
     */
    protected $entSearch = array(
        '/&#153;/i',      // TM symbol in win-1252
        '/&#151;/i',      // m-dash in win-1252
        '/&(amp|#38);/i', // Ampersand: see converter()
        '/[ ]{2,}/',      // Runs of spaces, post-handling
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array
     * @see $entSearch
     */
    protected $entReplace = array(
        '™',         // TM symbol
        '—',         // m-dash
        '|+|amp|+|', // Ampersand: see converter()
        ' ',         // Runs of spaces, post-handling
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * pregCallback() dispatches on.
     *
     * @var array
     * @see pregCallback()
     */
    protected $callbackSearch = array(
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
        '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
        '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $preReplace.
     *
     * @var array
     * @see $preReplace
     */
    protected $preSearch = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * @var array
     * @see $preSearch
     */
    protected $preReplace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        '',
    );

    /**
     * Temporary workspace used during PRE processing.
     *
     * @var string
     */
    protected $preContent = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string
     */
    protected $baseurl = '';

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean
     * @see $html, $text
     */
    protected $converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array
     * @see buildlinkList()
     */
    protected $linkList = array();

    /**
     * Various configuration options (able to be set in the constructor)
     *
     * @var array
     */
    protected $options = array(
        'do_links' => 'inline', // 'none'
                                // 'inline' (show links inline)
                                // 'nextline' (show links on the next line)
                                // 'table' (if a table of link URLs should be listed after the text)
                                // 'bbcode' (show links as bbcode)

        'width' => 70,          // Maximum width of the formatted text, in columns.
                                // Set this value to 0 (or less) to ignore word wrapping
                                // and not constrain text to a fixed-width column.
    );

    /**
     * Initialiser for the old (html, fromFile, options) constructor signature.
     *
     * @param string $html     Source HTML
     * @param bool   $fromFile Must be false; set_html() rejects a truthy value
     * @param array  $options  Set configuration options
     * @throws \InvalidArgumentException when $fromFile is truthy
     */
    private function legacyConstruct($html = '', $fromFile = false, array $options = array())
    {
        $this->set_html($html, $fromFile);
        $this->options = array_merge($this->options, $options);
    }

    /**
     * @param string $html    Source HTML
     * @param array  $options Set configuration options
     */
    public function __construct($html = '', $options = array())
    {
        // Assign the entity flags before branching: the legacy path returns
        // early, and without this every later html_entity_decode() /
        // htmlspecialchars() call would receive null flags.
        $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
            ? ENT_COMPAT
            : ENT_COMPAT | ENT_HTML5;

        // for backwards compatibility
        if (!is_array($options)) {
            call_user_func_array(array($this, 'legacyConstruct'), func_get_args());

            return;
        }

        $this->html = $html;
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Get the source HTML
     *
     * @return string
     */
    public function getHtml()
    {
        return $this->html;
    }

    /**
     * Set the source HTML
     *
     * Also marks the instance as not converted, so the next getText()
     * call converts the new content.
     *
     * @param string $html HTML source content
     */
    public function setHtml($html)
    {
        $this->html = $html;
        $this->converted = false;
    }

    /**
     * @deprecated
     *
     * @param string $html      HTML source content
     * @param bool   $from_file No longer supported; must be falsy
     * @throws \InvalidArgumentException when $from_file is truthy
     */
    public function set_html($html, $from_file = false)
    {
        if ($from_file) {
            throw new \InvalidArgumentException("Argument from_file no longer supported");
        }

        return $this->setHtml($html);
    }

    /**
     * Returns the text, converted from HTML.
     *
     * @return string
     */
    public function getText()
    {
        if (!$this->converted) {
            $this->convert();
        }

        return $this->text;
    }

    /**
     * @deprecated
     */
    public function get_text()
    {
        return $this->getText();
    }

    /**
     * @deprecated
     */
    public function print_text()
    {
        print $this->getText();
    }

    /**
     * @deprecated
     */
    public function p()
    {
        return $this->print_text();
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * @param string $baseurl
     */
    public function setBaseUrl($baseurl)
    {
        $this->baseurl = $baseurl;
    }

    /**
     * @deprecated
     */
    public function set_base_url($baseurl)
    {
        return $this->setBaseUrl($baseurl);
    }

    /**
     * Runs the conversion with the mbstring internal encoding temporarily
     * switched to self::ENCODING, restoring the previous encoding afterwards.
     */
    protected function convert()
    {
        $origEncoding = mb_internal_encoding();
        mb_internal_encoding(self::ENCODING);

        $this->doConvert();

        mb_internal_encoding($origEncoding);
    }

    /**
     * Converts $this->html into $this->text and flags the instance as converted.
     *
     * When links were collected (the 'table' link method), a numbered
     * "Links:" section is appended after the converted body.
     */
    protected function doConvert()
    {
        $this->linkList = array();

        $text = trim($this->html);

        $this->converter($text);

        if ($this->linkList) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->linkList as $i => $url) {
                $text .= '[' . ($i + 1) . '] ' . $url . "\n";
            }
        }

        $this->text = $text;

        $this->converted = true;
    }

    /**
     * Converts an HTML string to plain text in place.
     *
     * Also invoked recursively by convertBlockquotes() on the body of each
     * top-level blockquote.
     *
     * @param string $text HTML on entry, plain text on return
     */
    protected function converter(&$text)
    {
        $this->convertBlockquotes($text);
        $this->convertPre($text);
        $text = preg_replace($this->search, $this->replace, $text);
        $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
        $text = strip_tags($text);
        $text = preg_replace($this->entSearch, $this->entReplace, $text);
        $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Normalise empty lines
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        if ($this->options['width'] > 0) {
            $text = wordwrap($text, $this->options['width']);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param  string      $link         URL of the link
     * @param  string      $display      Part of the text to associate number with
     * @param  string|null $linkOverride Link method to use instead of the 'do_links' option
     * @return string
     */
    protected function buildlinkList($link, $display, $linkOverride = null)
    {
        $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
        if ($linkMethod == 'none') {
            return $display;
        }

        // Ignored link types
        if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // Link carries its own scheme: use it untouched.
            $url = $link;
        } else {
            // Relative link: prefix the base URL, inserting a slash if the link lacks one.
            $url = $this->baseurl;
            if (mb_substr($link, 0, 1) !== '/') {
                $url .= '/';
            }
            $url .= $link;
        }

        if ($linkMethod == 'table') {
            // Strict search: URLs are strings and must not be merged by
            // PHP's loose numeric-string comparison.
            $index = array_search($url, $this->linkList, true);
            if ($index === false) {
                $index = count($this->linkList);
                $this->linkList[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';
        }

        if ($linkMethod == 'nextline') {
            if ($url === $display) {
                return $display;
            }

            return $display . "\n[" . $url . ']';
        }

        if ($linkMethod == 'bbcode') {
            return sprintf('[url=%s]%s[/url]', $url, $display);
        }

        // link_method defaults to inline
        if ($url === $display) {
            return $display;
        }

        return $display . ' [' . $url . ']';
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Replaces each <pre> element, one at a time, with a <div> whose
     * newlines, tabs and spaces have been turned into <br> / &nbsp; markup.
     *
     * @param string $text HTML content
     */
    protected function convertPre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
            $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

            // Run our defined tags search-and-replace with callback
            $this->preContent = preg_replace_callback(
                $this->callbackSearch,
                array($this, 'pregCallback'),
                $this->preContent
            );

            // convert the content
            $this->preContent = sprintf(
                '<div><br>%s<br></div>',
                preg_replace($this->preSearch, $this->preReplace, $this->preContent)
            );

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback(
                '/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pregPreCallback'),
                $text,
                1
            );

            // free memory
            $this->preContent = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Each top-level blockquote is converted on its own, prefixed with
     * "> " citation markers and wrapped in a <pre> element for convertPre().
     *
     * @param string $text HTML content
     */
    protected function convertBlockquotes(&$text)
    {
        if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            $originalText = $text;
            $start = 0;
            $taglen = 0;
            $level = 0;
            // Running difference, in characters, between the original text and
            // the progressively rewritten $text.
            $diff = 0;
            foreach ($matches[0] as $m) {
                // PREG_OFFSET_CAPTURE reports byte offsets; convert to a character offset.
                $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
                if ($m[0][0] == '<' && $m[0][1] == '/') {
                    $level--;
                    if ($level < 0) {
                        $level = 0; // malformed HTML: go to next blockquote
                    } elseif ($level > 0) {
                        // skip inner blockquote
                    } else {
                        $end = $m[1];
                        $len = $end - $taglen - $start;
                        // Get blockquote content
                        $body = mb_substr($text, $start + $taglen - $diff, $len);

                        // Set text width (leave room for the "> " marker)
                        $pWidth = $this->options['width'];
                        if ($this->options['width'] > 0) {
                            $this->options['width'] -= 2;
                        }
                        // Convert blockquote content
                        $body = trim($body);
                        $this->converter($body);
                        // Add citation markers and create PRE block
                        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                        $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
                        // Re-set text width
                        $this->options['width'] = $pWidth;
                        // Replace content
                        $text = mb_substr($text, 0, $start - $diff)
                            . $body
                            . mb_substr($text, $end + mb_strlen($m[0]) - $diff);

                        $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
                        unset($body);
                    }
                } else {
                    if ($level == 0) {
                        $start = $m[1];
                        $taglen = mb_strlen($m[0]);
                    }
                    $level++;
                }
            }
        }
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured as group 1 by the
     * $callbackSearch patterns.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregCallback($matches)
    {
        switch (mb_strtolower($matches[1])) {
            case 'p':
                // Replace newlines with spaces.
                $para = str_replace("\n", " ", $matches[3]);

                // Trim trailing and leading whitespace within the tag.
                $para = trim($para);

                // Add trailing newlines for this para.
                return "\n" . $para . "\n";
            case 'br':
                return "\n";
            case 'b':
            case 'strong':
                return $this->toupper($matches[3]);
            case 'th':
                return $this->toupper("\t\t" . $matches[3] . "\n");
            case 'h':
                return $this->toupper("\n\n" . $matches[3] . "\n\n");
            case 'a':
                // override the link method
                $linkOverride = null;
                if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
                    $linkOverride = $linkOverrideMatch[1];
                }
                // Remove spaces in URL (#1487805)
                $url = str_replace(' ', '', $matches[3]);

                return $this->buildlinkList($url, $matches[5], $linkOverride);
        }

        return '';
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
    {
        return $this->preContent;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $i => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$i] = $this->strtoupper($chunk);
            }
        }

        return implode('', $chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded before upper-casing and the result is re-escaped
     * with htmlspecialchars().
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function strtoupper($str)
    {
        $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

        return $str;
    }
}