MDL-51524 libraries: Update to latest version of html2text
[moodle.git] / lib / html2text / Html2Text.php
CommitLineData
ec2d33df
AN
1<?php
2
3/*
4 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
5 *
6 * This script is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * The GNU General Public License can be found at
12 * http://www.gnu.org/copyleft/gpl.html.
13 *
14 * This script is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 */
19
20namespace Html2Text;
21
22class Html2Text
23{
/**
 * Character encoding handed to the entity and case-conversion functions.
 */
const ENCODING = 'UTF-8';

/**
 * HTML source that is waiting to be converted.
 *
 * @var string
 */
protected $html;

/**
 * Plain-text result of the most recent conversion.
 *
 * @var string
 */
protected $text;
39
/**
 * Regular expressions applied, in order, by preg_replace() during conversion.
 *
 * Entry N here is replaced by entry N of $replace, so both arrays must keep
 * the same length and ordering.
 *
 * @var array
 * @see $replace
 */
protected $search = array(
    "/\r/",                                           // stray carriage returns
    "/[\n\t]+/",                                      // runs of newlines and tabs
    '/<head[^>]*>.*?<\/head>/i',                      // whole <head> section
    '/<script[^>]*>.*?<\/script>/i',                  // <script> blocks (strip_tags supposedly struggles with these)
    '/<style[^>]*>.*?<\/style>/i',                    // <style> blocks (strip_tags supposedly struggles with these)
    '/<p[^>]*>/i',                                    // opening <p>
    '/<br[^>]*>/i',                                   // <br>
    '/<i[^>]*>(.*?)<\/i>/i',                          // <i>...</i>
    '/<em[^>]*>(.*?)<\/em>/i',                        // <em>...</em>
    '/(<ul[^>]*>|<\/ul>)/i',                          // <ul> and </ul>
    '/(<ol[^>]*>|<\/ol>)/i',                          // <ol> and </ol>
    '/(<dl[^>]*>|<\/dl>)/i',                          // <dl> and </dl>
    '/<li[^>]*>(.*?)<\/li>/i',                        // <li>...</li>
    '/<dd[^>]*>(.*?)<\/dd>/i',                        // <dd>...</dd>
    '/<dt[^>]*>(.*?)<\/dt>/i',                        // <dt>...</dt>
    '/<li[^>]*>/i',                                   // unclosed <li>
    '/<hr[^>]*>/i',                                   // <hr>
    '/<div[^>]*>/i',                                  // opening <div>
    '/(<table[^>]*>|<\/table>)/i',                    // <table> and </table>
    '/(<tr[^>]*>|<\/tr>)/i',                          // <tr> and </tr>
    '/<td[^>]*>(.*?)<\/td>/i',                        // <td>...</td>
    '/<span class="_html2text_ignore">.+?<\/span>/i', // content explicitly marked to be ignored
    '/<(img)[^>]*alt=\"([^>"]+)\"[^>]*>/i',           // <img> carrying an alt attribute
);

/**
 * Replacement strings matching $search position by position.
 *
 * @var array
 * @see $search
 */
protected $replace = array(
    '',                              // stray carriage returns
    ' ',                             // runs of newlines and tabs
    '',                              // whole <head> section
    '',                              // <script> blocks
    '',                              // <style> blocks
    "\n\n",                          // opening <p>
    "\n",                            // <br>
    '_\\1_',                         // <i>...</i>
    '_\\1_',                         // <em>...</em>
    "\n\n",                          // <ul> and </ul>
    "\n\n",                          // <ol> and </ol>
    "\n\n",                          // <dl> and </dl>
    "\t* \\1\n",                     // <li>...</li>
    " \\1\n",                        // <dd>...</dd>
    "\t* \\1",                       // <dt>...</dt>
    "\n\t* ",                        // unclosed <li>
    "\n-------------------------\n", // <hr>
    "<div>\n",                       // opening <div>
    "\n\n",                          // <table> and </table>
    "\n",                            // <tr> and </tr>
    "\t\t\\1\n",                     // <td>...</td>
    "",                              // content explicitly marked to be ignored
    '[\\2]',                         // <img> carrying an alt attribute
);
104
/**
 * Entity-related regular expressions applied after tags have been stripped.
 *
 * Entry N here is replaced by entry N of $entReplace.
 *
 * @var array
 * @see $entReplace
 */
protected $entSearch = array(
    '/&#153;/i',      // trademark sign as a win-1252 numeric entity
    '/&#151;/i',      // em dash as a win-1252 numeric entity
    '/&(amp|#38);/i', // ampersand, swapped for a placeholder (see converter())
    '/[ ]{2,}/',      // runs of spaces left behind by earlier steps
);

/**
 * Replacement strings matching $entSearch position by position.
 *
 * @var array
 * @see $entSearch
 */
protected $entReplace = array(
    '™',         // trademark sign
    '—',         // em dash
    '|+|amp|+|', // ampersand placeholder, turned back into "&" by converter()
    ' ',         // runs of spaces collapse to a single space
);
131
/**
 * Regular expressions whose matches are rewritten by pregCallback().
 *
 * The first capture group of every pattern is the tag name the callback
 * dispatches on.
 *
 * @var array
 */
protected $callbackSearch = array(
    '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // headings h1 to h6
    '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>...</b>
    '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>...</strong>
    '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th>...</th>
    '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i', // <a href="...">...</a>
);

/**
 * Regular expressions applied to the body of a PRE element.
 *
 * Entry N here is replaced by entry N of $preReplace.
 *
 * @var array
 * @see $preReplace
 */
protected $preSearch = array(
    "/\n/",         // newline
    "/\t/",         // tab
    '/ /',          // single space
    '/<pre[^>]*>/', // opening <pre> tag
    '/<\/pre>/',    // closing </pre> tag
);

/**
 * Replacement strings matching $preSearch position by position.
 *
 * @var array
 * @see $preSearch
 */
protected $preReplace = array(
    '<br>',                     // newline
    '&nbsp;&nbsp;&nbsp;&nbsp;', // tab
    '&nbsp;',                   // single space
    '',                         // opening <pre> tag
    '',                         // closing </pre> tag
);
174
/**
 * Scratch buffer holding the rewritten body of the PRE element currently
 * being processed; read back by pregPreCallback().
 *
 * @var string
 */
protected $preContent = '';

/**
 * Base URL that relative links are prefixed with.
 *
 * @var string
 */
protected $baseurl = '';

/**
 * True once $html has been converted and $text holds the result.
 *
 * @var bool
 * @see $html, $text
 */
protected $converted = false;

/**
 * URLs collected from links, listed after the text in 'table' link mode.
 *
 * @var array
 * @see buildlinkList()
 */
protected $linkList = array();

/**
 * Configuration options; values passed to the constructor are merged over
 * these defaults.
 *
 * @var array
 */
protected $options = array(
    // How link URLs are rendered:
    //   'none'     - link URLs are not shown
    //   'inline'   - URL follows the link text on the same line
    //   'nextline' - URL is placed on the line after the link text
    //   'table'    - link text gets a number and the URLs are listed after the text
    'do_links' => 'inline',

    // Maximum width of the formatted text, in columns. A value of 0 (or
    // less) disables word wrapping altogether.
    'width' => 70,
);
220
/**
 * Initialiser for the legacy three-argument constructor form.
 *
 * Called by __construct() when its second argument is not an array.
 *
 * @param string $html     Source HTML
 * @param bool   $fromFile Legacy flag; set_html() throws when it is truthy
 * @param array  $options  Configuration options merged over the defaults
 */
private function legacyConstruct($html = '', $fromFile = false, array $options = array())
{
    $this->set_html($html, $fromFile);

    $merged = array_merge($this->options, $options);
    $this->options = $merged;
}
226
/**
 * Creates a converter for the given HTML.
 *
 * When $options is not an array the call is treated as the legacy
 * ($html, $fromFile, $options) form and all received arguments are
 * forwarded to legacyConstruct().
 *
 * @param string $html    Source HTML
 * @param array  $options Configuration options merged over the defaults
 */
public function __construct($html = '', $options = array())
{
    if (is_array($options)) {
        $this->html = $html;
        $this->options = array_merge($this->options, $options);

        return;
    }

    // Backwards compatibility: pass every received argument through untouched.
    $legacyArgs = func_get_args();
    call_user_func_array(array($this, 'legacyConstruct'), $legacyArgs);
}
241
/**
 * Replaces the HTML source and marks any earlier result as stale.
 *
 * @param string $html HTML source content
 */
public function setHtml($html)
{
    // Drop the "already converted" flag so the next getText() converts again.
    $this->converted = false;
    $this->html = $html;
}
252
/**
 * Legacy alias of setHtml().
 *
 * @deprecated
 *
 * @param string $html      HTML source content
 * @param bool   $from_file Must be falsy; reading from a file is no longer supported
 * @return mixed Whatever setHtml() returns
 * @throws \InvalidArgumentException When $from_file is truthy
 */
public function set_html($html, $from_file = false)
{
    if (!$from_file) {
        return $this->setHtml($html);
    }

    throw new \InvalidArgumentException("Argument from_file no longer supported");
}
264
/**
 * Returns the plain-text rendering of the HTML, converting it first if
 * that has not happened yet.
 *
 * @return string
 */
public function getText()
{
    if ($this->converted) {
        return $this->text;
    }

    $this->convert();

    return $this->text;
}
278
/**
 * Legacy alias of getText().
 *
 * @deprecated
 *
 * @return string
 */
public function get_text()
{
    $plain = $this->getText();

    return $plain;
}
286
/**
 * Writes the converted text to the output.
 *
 * @deprecated
 */
public function print_text()
{
    echo $this->getText();
}
294
/**
 * Legacy shorthand for print_text().
 *
 * @deprecated
 *
 * @return mixed Whatever print_text() returns
 */
public function p()
{
    $printed = $this->print_text();

    return $printed;
}
302
/**
 * Sets the base URL that relative links are prefixed with.
 *
 * The base URL is baked into the converted text, so a previously cached
 * result is marked stale and the next getText() call converts again.
 *
 * @param string $baseurl
 */
public function setBaseUrl($baseurl)
{
    $this->baseurl = $baseurl;
    // Mirror setHtml(): changing a conversion input invalidates the cached text.
    $this->converted = false;
}
312
/**
 * Legacy alias of setBaseUrl().
 *
 * @deprecated
 *
 * @param string $baseurl
 * @return mixed Whatever setBaseUrl() returns
 */
public function set_base_url($baseurl)
{
    $result = $this->setBaseUrl($baseurl);

    return $result;
}
320
/**
 * Converts $html to plain text, stores the result in $text and marks the
 * object as converted.
 *
 * When links were collected into $linkList during conversion, a numbered
 * "Links:" section listing them is appended after the text.
 */
protected function convert()
{
    $this->linkList = array();

    $output = trim($this->html);
    $this->converter($output);

    if (!empty($this->linkList)) {
        $entries = array();
        foreach ($this->linkList as $position => $address) {
            $entries[] = '[' . ($position + 1) . '] ' . $address . "\n";
        }
        $output .= "\n\nLinks:\n------\n" . implode('', $entries);
    }

    $this->text = $output;
    $this->converted = true;
}
340
/**
 * Runs the full HTML-to-text pipeline on a fragment, in place.
 *
 * @param string $text HTML on entry, plain text on return (passed by reference)
 */
protected function converter(&$text)
{
    // Blockquotes and PRE blocks are rewritten first, directly on the input.
    $this->convertBlockquotes($text);
    $this->convertPre($text);

    $buffer = preg_replace($this->search, $this->replace, $text);
    $buffer = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $buffer);
    $buffer = strip_tags($buffer);
    $buffer = preg_replace($this->entSearch, $this->entReplace, $buffer);
    $buffer = html_entity_decode($buffer, ENT_QUOTES, self::ENCODING);

    // Entities that are still present are unknown or unhandled: drop them.
    // (This cannot be part of the search-and-replace tables above.)
    $buffer = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $buffer);

    // Only now turn the ampersand placeholder back into "&"; doing it after
    // the unknown-entity clean-up keeps input such as "&amp;quot;" intact.
    $buffer = str_replace('|+|amp|+|', '&', $buffer);

    // Collapse whitespace-only lines and long runs of newlines.
    $buffer = preg_replace("/\n\s+\n/", "\n\n", $buffer);
    $buffer = preg_replace("/[\n]{3,}/", "\n\n", $buffer);

    // Leading blank lines can appear when, for example, the input starts with a P tag.
    $buffer = ltrim($buffer, "\n");

    $width = $this->options['width'];
    if ($width > 0) {
        $buffer = wordwrap($buffer, $width);
    }

    $text = $buffer;
}
369
/**
 * Formats one link for plain-text output.
 *
 * Links whose scheme is not recognised as absolute are prefixed with the
 * base URL. In 'table' mode the URL is stored in $linkList (once per distinct
 * URL) and only its number is attached to the display text.
 *
 * @param string      $link         URL of the link
 * @param string      $display      Text the link is attached to
 * @param string|null $linkOverride Link method overriding the 'do_links' option
 * @return string Display text, with the URL or its number added where applicable
 */
protected function buildlinkList($link, $display, $linkOverride = null)
{
    $linkMethod = $linkOverride ?: $this->options['do_links'];

    // Nothing is appended when links are switched off or the link type is ignored.
    if ($linkMethod == 'none' || preg_match('!^(javascript:|mailto:|#)!i', $link)) {
        return $display;
    }

    if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
        // The link carries its own scheme: use it unchanged.
        $url = $link;
    } else {
        // Relative link: glue it to the base URL, adding the slash if missing.
        $separator = (substr($link, 0, 1) != '/') ? '/' : '';
        $url = $this->baseurl . $separator . $link;
    }

    switch ($linkMethod) {
        case 'table':
            $index = array_search($url, $this->linkList);
            if ($index === false) {
                $index = count($this->linkList);
                $this->linkList[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';

        case 'nextline':
            return $display . "\n[" . $url . ']';

        default:
            // Any other method falls back to inline rendering.
            return $display . ' [' . $url . ']';
    }
}
418
/**
 * Replaces every PRE element in the text with an equivalent DIV block.
 *
 * The body of each PRE element is passed through the callback patterns and
 * the $preSearch/$preReplace tables, then wrapped in "<div><br>...<br></div>".
 *
 * @param string $text HTML content, modified in place
 */
protected function convertPre(&$text)
{
    // Handle one PRE element per iteration until none is left.
    while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $found)) {
        // Apply the tag callbacks to the body of the PRE element.
        $body = preg_replace_callback(
            $this->callbackSearch,
            array($this, 'pregCallback'),
            $found[1]
        );

        // Rewrite the body and stage it where pregPreCallback() can read it.
        $body = preg_replace($this->preSearch, $this->preReplace, $body);
        $this->preContent = sprintf('<div><br>%s<br></div>', $body);

        // Swap in the staged content through a callback, because the content
        // may contain sequences such as $0 that a plain replacement string
        // would interpret. Only the first PRE element is replaced.
        $text = preg_replace_callback(
            '/<pre[^>]*>.*<\/pre>/ismU',
            array($this, 'pregPreCallback'),
            $text,
            1
        );

        // Release the staged content.
        $this->preContent = '';
    }
}
450
/**
 * Converts top-level BLOCKQUOTE elements into quoted PRE blocks, in place.
 *
 * The body of each outermost blockquote is run through converter() with the
 * width reduced by two columns, every resulting line is prefixed with "> ",
 * and the escaped result is wrapped in a PRE element. Nested blockquotes
 * are handled by the recursive converter() call on the outer body.
 *
 * @param string $text HTML content, modified in place
 */
protected function convertBlockquotes(&$text)
{
    if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
        $start = 0;  // offset of the current outermost opening tag (in the original text)
        $taglen = 0; // length of that opening tag
        $level = 0;  // current nesting depth
        $diff = 0;   // total number of bytes the text has shrunk by so far

        foreach ($matches[0] as $m) {
            if ($m[0][0] == '<' && $m[0][1] == '/') {
                $level--;
                if ($level < 0) {
                    $level = 0; // malformed HTML: go to next blockquote
                } elseif ($level > 0) {
                    // skip inner blockquote
                } else {
                    $end = $m[1];
                    $len = $end - $taglen - $start;
                    // Get blockquote content; match offsets refer to the
                    // original text, so shift them by the accumulated change.
                    $body = substr($text, $start + $taglen - $diff, $len);

                    // Set text width, leaving room for the "> " marker
                    $pWidth = $this->options['width'];
                    if ($this->options['width'] > 0) {
                        $this->options['width'] -= 2;
                    }
                    // Convert blockquote content
                    $body = trim($body);
                    $this->converter($body);
                    // Add citation markers and create PRE block
                    $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                    $body = '<pre>' . htmlspecialchars($body) . '</pre>';
                    // Re-set text width
                    $this->options['width'] = $pWidth;
                    // Replace content
                    $text = substr($text, 0, $start - $diff)
                        . $body . substr($text, $end + strlen($m[0]) - $diff);

                    // Accumulate the shift: every earlier replacement keeps
                    // affecting the position of all later blockquotes.
                    $diff += $len + $taglen + strlen($m[0]) - strlen($body);
                    unset($body);
                }
            } else {
                if ($level == 0) {
                    $start = $m[1];
                    $taglen = strlen($m[0]);
                }
                $level++;
            }
        }
    }
}
504
/**
 * Callback for preg_replace_callback() handling the $callbackSearch patterns.
 *
 * Dispatches on the tag name captured in $matches[1]: bold, strong, table
 * header and heading content is upper-cased, anchors are rendered through
 * buildlinkList(). Any other tag name yields an empty string.
 *
 * @param array $matches PREG matches
 * @return string Replacement text
 */
protected function pregCallback($matches)
{
    $tag = strtolower($matches[1]);

    if ($tag === 'b' || $tag === 'strong') {
        return $this->toupper($matches[3]);
    }

    if ($tag === 'th') {
        return $this->toupper("\t\t" . $matches[3] . "\n");
    }

    if ($tag === 'h') {
        return $this->toupper("\n\n" . $matches[3] . "\n\n");
    }

    if ($tag === 'a') {
        // A "_html2text_link_<method>" marker among the remaining attributes
        // overrides the configured link method for this one link.
        $linkOverride = null;
        if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
            $linkOverride = $linkOverrideMatch[1];
        }

        // Remove spaces in URL (#1487805)
        $url = str_replace(' ', '', $matches[3]);

        return $this->buildlinkList($url, $matches[5], $linkOverride);
    }

    return '';
}
535
/**
 * Callback for preg_replace_callback() used by the PRE content handler.
 *
 * The matched text is ignored; the replacement is whatever is currently
 * stored in $preContent.
 *
 * @param array $matches PREG matches (unused)
 * @return string Current value of $preContent
 */
protected function pregPreCallback($matches)
{
    return $this->preContent;
}
546
/**
 * Upper-cases a string while leaving HTML tags untouched.
 *
 * The string is split on tags; only the chunks between tags are passed to
 * strtoupper(), which also takes care of HTML entities.
 *
 * @param string $str Text to convert
 * @return string Converted text
 */
protected function toupper($str)
{
    // Split into tags and the text between them. The limit must be an
    // integer: -1 means "no limit"; passing null is deprecated since PHP 8.1.
    $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

    // Convert only the text between HTML tags; PREG_SPLIT_NO_EMPTY guarantees
    // every chunk has a first character to inspect.
    foreach ($chunks as $i => $chunk) {
        if ($chunk[0] != '<') {
            $chunks[$i] = $this->strtoupper($chunk);
        }
    }

    return implode('', $chunks);
}
567
/**
 * Upper-cases text that may contain HTML entities.
 *
 * Entities are decoded first so that they are not upper-cased themselves,
 * and the special characters are encoded again afterwards. The multibyte
 * mb_strtoupper() is used when it is available.
 *
 * @param string $str Text to convert
 * @return string Converted text
 */
protected function strtoupper($str)
{
    $decoded = html_entity_decode($str, ENT_COMPAT, self::ENCODING);

    $upper = function_exists('mb_strtoupper')
        ? mb_strtoupper($decoded, self::ENCODING)
        : strtoupper($decoded);

    return htmlspecialchars($upper, ENT_COMPAT, self::ENCODING);
}
588}