MDL-41904 import Markdown lib 1.3
[moodle.git] / lib / markdown / Markdown.php
1 <?php
2 #
3 # Markdown  -  A text-to-HTML conversion tool for web writers
4 #
5 # PHP Markdown  
6 # Copyright (c) 2004-2013 Michel Fortin  
7 # <http://michelf.com/projects/php-markdown/>
8 #
9 # Original Markdown  
10 # Copyright (c) 2004-2006 John Gruber  
11 # <http://daringfireball.net/projects/markdown/>
12 #
13 namespace Michelf;
16 #
17 # Markdown Parser Class
18 #
20 class Markdown {
22         ### Version ###
24         const  MARKDOWNLIB_VERSION  =  "1.3";
26         ### Simple Function Interface ###
public static function defaultTransform($text) {
#
# Convenience entry point: transform $text with a parser instance that is
# created once per calling class and reused afterwards. Because the class
# name is resolved with late static binding, derived classes get a parser
# of their own type.
#
        # Parser instances created so far, keyed by class name.
        static $parser_list;

        # Class on which this static method was invoked.
        $class_name = \get_called_class();

        # Create the parser if it is not already set.
        if (!isset($parser_list[$class_name])) {
                $parser_list[$class_name] = new $class_name;
        }

        # Transform text using the cached parser.
        return $parser_list[$class_name]->transform($text);
}
### Configuration Variables ###
# These properties are public, so callers can adjust them on a parser
# instance.

# Suffix appended when an empty element (<hr>, <br>) is emitted.
# Change to ">" for HTML output.
public $empty_element_suffix = " />";
# Link definitions and raw HTML blocks may be indented by at most
# tab_width - 1 spaces.
public $tab_width = 4;

# Change to `true` to disallow markup or entities.
# (hashHTMLBlocks() returns its input untouched when $no_markup is set.)
public $no_markup = false;
public $no_entities = false;

# Predefined urls and titles for reference links and images.
# setup() copies them into the working tables when a transformation starts.
public $predef_urls = array();
public $predef_titles = array();
### Parser Implementation ###
# Regex to match balanced [brackets].
# Needed to insert a maximum bracket depth while converting to PHP.
# The constructor builds the *_re fragments from the *_depth values.
protected $nested_brackets_depth = 6;
protected $nested_brackets_re;

protected $nested_url_parenthesis_depth = 4;
protected $nested_url_parenthesis_re;
# Table of hash values for escaped characters:
# The constructor derives $escape_chars_re from $escape_chars: a character
# class matching any single one of these characters.
protected $escape_chars = '\`*_{}[]()>#+-.!';
protected $escape_chars_re;
public function __construct() {
#
# Prepare everything the parser needs before it can transform text:
# helper initialisation, the nesting-aware regex fragments, the escape
# character class, and the priority ordering of the three gamuts.
#
        $this->_initDetab();
        $this->prepareItalicsAndBold();

        # Balanced [brackets]: N nested openers followed by the N closers.
        $bracket_levels = $this->nested_brackets_depth;
        $bracket_open   = str_repeat('(?>[^\[\]]+|\[', $bracket_levels);
        $bracket_close  = str_repeat('\])*', $bracket_levels);
        $this->nested_brackets_re = $bracket_open . $bracket_close;

        # Same construction for (parentheses) appearing inside URLs.
        $paren_levels = $this->nested_url_parenthesis_depth;
        $paren_open   = str_repeat('(?>[^()\s]+|\(', $paren_levels);
        $paren_close  = str_repeat('(?>\)))*', $paren_levels);
        $this->nested_url_parenthesis_re = $paren_open . $paren_close;

        # Character class matching any one of the escape characters.
        $this->escape_chars_re = '[' . preg_quote($this->escape_chars) . ']';

        # Order document, block and span gamuts by ascending priority value.
        asort($this->document_gamut);
        asort($this->block_gamut);
        asort($this->span_gamut);
}
# Internal hashes used during transformation.
# $urls and $titles hold reference-link data keyed by lower-cased link id;
# $html_hashes maps each hash token created by hashPart() to the text it
# stands for.
protected $urls = array();
protected $titles = array();
protected $html_hashes = array();

# Status flag to avoid invalid nesting.
# doAnchors() raises it while links are being processed and returns its
# input unchanged when it is already raised.
protected $in_anchor = false;
109         
110         
protected function setup() {
#
# Reset the per-run parser state; transform() calls this before it starts
# processing the text.
#
        # No link is being processed at the start of a run.
        $this->in_anchor = false;

        # Forget hash tokens left over from a previous run.
        $this->html_hashes = array();

        # Seed the reference-link tables with the predefined entries.
        $this->titles = $this->predef_titles;
        $this->urls   = $this->predef_urls;
}
123         
protected function teardown() {
#
# Called once the transformation is finished: empty the lookup tables so
# they do not keep memory occupied unnecessarily.
#
        $this->urls = $this->titles = $this->html_hashes = array();
}
public function transform($text) {
#
# Main entry point. Normalizes the input text, runs every method of the
# document gamut over it in priority order, and returns the result with
# a trailing newline.
#
        $this->setup();

        # Drop a leading UTF-8 BOM and any \x1A marker character.
        $text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);

        # Normalize DOS (\r\n) and Mac (\r) line endings to Unix (\n).
        $text = preg_replace('{\r\n?}', "\n", $text);

        # Guarantee a couple of trailing newlines, then turn tabs into spaces.
        $text = $this->detab($text . "\n\n");

        # Replace block-level HTML with hash tokens.
        $text = $this->hashHTMLBlocks($text);

        # Empty every line that holds nothing but spaces, so that later
        # patterns can recognize consecutive blank lines with a plain /\n+/
        # rather than something like /[ ]*\n+/ .
        $text = preg_replace('/^[ ]+$/m', '', $text);

        # Run document gamut methods.
        foreach (array_keys($this->document_gamut) as $method) {
                $text = $this->$method($text);
        }

        $this->teardown();

        return $text . "\n";
}
173         
protected $document_gamut = array(
        # Whole-document passes: method name => priority. The constructor
        # sorts this array with asort() and transform() calls each method,
        # in the resulting order, on the complete text.

        # Strip link definitions, store in hashes.
        "stripLinkDefinitions" => 20,
        
        "runBasicBlockGamut"   => 30,
        );
protected function stripLinkDefinitions($text) {
#
# Remove reference-link definitions from $text. Each definition found is
# handed to _stripLinkDefinitions_callback(), which records its URL and
# title; the text is returned without the definitions.
#
        # A definition may be indented by fewer spaces than one tab stop.
        $max_indent = $this->tab_width - 1;

        # Link defs are in the form: ^[id]: url "optional title"
        $definition_re = '{
                                                        ^[ ]{0,'.$max_indent.'}\[(.+)\][ ]?: # id = $1
                                                          [ ]*
                                                          \n?                           # maybe *one* newline
                                                          [ ]*
                                                        (?:
                                                          <(.+?)>                       # url = $2
                                                        |
                                                          (\S+?)                        # url = $3
                                                        )
                                                          [ ]*
                                                          \n?                           # maybe one newline
                                                          [ ]*
                                                        (?:
                                                                (?<=\s)                 # lookbehind for whitespace
                                                                ["(]
                                                                (.*?)                   # title = $4
                                                                [")]
                                                                [ ]*
                                                        )?      # title is optional
                                                        (?:\n+|\Z)
                        }xm';

        $callback = array($this, '_stripLinkDefinitions_callback');

        return preg_replace_callback($definition_re, $callback, $text);
}
protected function _stripLinkDefinitions_callback($matches) {
#
# Store one matched link definition in the url/title tables and return
# the empty string so that the definition vanishes from the text.
#
        # Link ids are compared case-insensitively: store them lower-cased.
        $id = strtolower($matches[1]);

        # The URL is either the <bracketed> form ($2) or the bare form ($3).
        if ($matches[2] == '') {
                $target = $matches[3];
        }
        else {
                $target = $matches[2];
        }
        $this->urls[$id] = $target;

        # The title ($4) is optional; binding by reference leaves a null
        # entry when the definition has none.
        $this->titles[$id] =& $matches[4];

        return ''; # String that will replace the block
}
protected function hashHTMLBlocks($text) {
#
# Replace raw block-level HTML in $text with hash tokens so that later
# passes leave it alone. Five kinds of block are recognized, each only
# after a blank line or at the start of the text:
#
# *  elements whose tag is always block-level (list "b" below);
# *  <ins>/<del> elements whose start tag is alone on its line (list "a");
# *  <hr> tags;
# *  standalone HTML comments;
# *  PHP and ASP-style processor instructions (<? ... ?> and <% ... %>).
#
# Returns $text unchanged when $no_markup is set; otherwise returns the
# text with every matched block replaced through _hashHTMLBlocks_callback().
#
        if ($this->no_markup)  return $text;

        # A block's start tag may be indented by fewer spaces than one tab stop.
        $less_than_tab = $this->tab_width - 1;

        # Hashify HTML blocks:
        # We only want to do this for block-level HTML tags, such as headers,
        # lists, and tables. That's because we still want to wrap <p>s around
        # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        # phrase emphasis, and spans. The list of tags we're looking for is
        # hard-coded:
        #
        # *  List "a" is made of tags which can be both inline or block-level.
        #    These will be treated block-level when the start tag is alone on 
        #    its line, otherwise they're not matched here and will be taken as 
        #    inline later.
        # *  List "b" is made of tags which are always block-level;
        #
        $block_tags_a_re = 'ins|del';
        $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'.
                                           'script|noscript|form|fieldset|iframe|math|svg|'.
                                           'article|section|nav|aside|hgroup|header|footer|'.
                                           'figure';

        # Regular expression for the content of a block tag.
        # $attr and $content use atomic (?>...) groups only, so embedding them
        # in the main pattern does not shift its capture-group numbers.
        $nested_tags_level = 4;
        $attr = '
                        (?>                             # optional tag attributes
                          \s                    # starts with whitespace
                          (?>
                                [^>"/]+         # text outside quotes
                          |
                                /+(?!>)         # slash not followed by ">"
                          |
                                "[^"]*"         # text inside double quotes (tolerate ">")
                          |
                                \'[^\']*\'      # text inside single quotes (tolerate ">")
                          )*
                        )?      
                        ';
        $content =
                        str_repeat('
                                (?>
                                  [^<]+                 # content without tag
                                |
                                  <\2                   # nested opening tag
                                        '.$attr.'       # attributes
                                        (?>
                                          />
                                        |
                                          >', $nested_tags_level).      # end of opening tag
                                          '.*?'.                                        # last level nested tag content
                        str_repeat('
                                          </\2\s*>      # closing nested tag
                                        )
                                  |                             
                                        <(?!/\2\s*>     # other tags with a different name
                                  )
                                )*',
                                $nested_tags_level);
        # Same content pattern for list "a" tags, whose name is captured in $3.
        $content2 = str_replace('\2', '\3', $content);

        # First, look for nested blocks, e.g.:
        #       <div>
        #               <div>
        #               tags for inner block must be indented.
        #               </div>
        #       </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        #
        # Capture groups of the combined pattern: $1 = whole block, $2 = list
        # "b" tag name, $3 = list "a" tag name, $4 = "hr", $5 = the "?" or "%"
        # that opens a processor instruction. The instruction's closing
        # delimiter must therefore be back-referenced as \5: group 2 is unset
        # in that alternative, and a back-reference to an unset group can
        # never match.
        $text = preg_replace_callback('{(?>
                        (?>
                                (?<=\n\n)               # Starting after a blank line
                                |                               # or
                                \A\n?                   # the beginning of the doc
                        )
                        (                                               # save in $1
                          # Match from `\n<tag>` to `</tag>\n`, handling nested tags 
                          # in between.
                                        
                                                [ ]{0,'.$less_than_tab.'}
                                                <('.$block_tags_b_re.')# start tag = $2
                                                '.$attr.'>                      # attributes followed by > and \n
                                                '.$content.'            # content, support nesting
                                                </\2>                           # the matching end tag
                                                [ ]*                            # trailing spaces/tabs
                                                (?=\n+|\Z)      # followed by a newline or end of document
                        | # Special version for tags of group a.
                                                [ ]{0,'.$less_than_tab.'}
                                                <('.$block_tags_a_re.')# start tag = $3
                                                '.$attr.'>[ ]*\n        # attributes followed by >
                                                '.$content2.'           # content, support nesting
                                                </\3>                           # the matching end tag
                                                [ ]*                            # trailing spaces/tabs
                                                (?=\n+|\Z)      # followed by a newline or end of document
                                        
                        | # Special case just for <hr />. It was easier to make a special 
                          # case than to make the other regex more complicated.
                        
                                                [ ]{0,'.$less_than_tab.'}
                                                <(hr)                           # start tag = $4
                                                '.$attr.'                       # attributes
                                                /?>                                     # the matching end tag
                                                [ ]*
                                                (?=\n{2,}|\Z)           # followed by a blank line or end of document
                        
                        | # Special case for standalone HTML comments:
                        
                                        [ ]{0,'.$less_than_tab.'}
                                        (?s:
                                                <!-- .*? -->
                                        )
                                        [ ]*
                                        (?=\n{2,}|\Z)           # followed by a blank line or end of document
                        
                        | # PHP and ASP-style processor instructions (<? and <%)
                        
                                        [ ]{0,'.$less_than_tab.'}
                                        (?s:
                                                <([?%])                 # opening delimiter = $5
                                                .*?
                                                \5>                             # same delimiter closes it
                                        )
                                        [ ]*
                                        (?=\n{2,}|\Z)           # followed by a blank line or end of document
                                        
                        )
                        )}Sxmi',
                array(&$this, '_hashHTMLBlocks_callback'),
                $text);

        return $text;
}
protected function _hashHTMLBlocks_callback($matches) {
#
# Replace one matched HTML block ($1) with its block-level hash token,
# surrounded by blank lines.
#
        return "\n\n" . $this->hashBlock($matches[1]) . "\n\n";
}
369         
370         
protected function hashPart($text, $boundary = 'X') {
#
# Store $text under a freshly generated unique token and return the token,
# which stands in for the text in the stream until `unhash` puts the text
# back. Used whenever an atomic element is inserted in the text stream.
#
# $boundary is the character placed at both ends of the token. By
# convention, "B" is used for block elements that need not be wrapped
# into paragraph tags at the end, ":" is used for elements that are word
# separators and "X" is used in the general case.
#
        # Tokens are numbered by a counter kept across calls.
        static $counter = 0;

        # Expand any token already present in $text, so that `unhash` does
        # not have to be applied multiple times at the end.
        $expanded = $this->unhash($text);

        # Token layout: boundary, \x1A, sequence number, boundary.
        $token = "$boundary\x1A" . ++$counter . $boundary;
        $this->html_hashes[$token] = $expanded;

        return $token; # String that will replace the tag.
}
protected function hashBlock($text) {
#
# Hash $text as a block-level element: hashPart() with the "B" boundary.
#
        $block_boundary = 'B';
        return $this->hashPart($text, $block_boundary);
}
protected $block_gamut = array(
#
# These are all the transformations that form block-level
# tags like paragraphs, headers, and list items.
#
# Keys are method names and values are priorities: the constructor sorts
# the array with asort() and runBasicBlockGamut() calls the methods in
# the resulting order.
#
        "doHeaders"         => 10,
        "doHorizontalRules" => 20,
        
        "doLists"           => 40,
        "doCodeBlocks"      => 50,
        "doBlockQuotes"     => 60,
        );
protected function runBlockGamut($text) {
#
# Run block gamut transformations on $text, hashing raw HTML blocks first.
#
        # Raw HTML in the Markdown source must be hashed before anything
        # else, and this has to happen for each block rather than only once
        # at the beginning: hashed blocks can be part of list items and may
        # have been indented, and an indented block would have been seen as
        # a code block in a previous pass of hashHTMLBlocks.
        $hashed = $this->hashHTMLBlocks($text);

        return $this->runBasicBlockGamut($hashed);
}
428         
protected function runBasicBlockGamut($text) {
#
# Run block gamut transformations without hashing HTML blocks first.
# Useful when the HTML blocks are known to be hashed already, as in the
# first whole-document pass.
#
        # Apply every block-level method in gamut order.
        foreach (array_keys($this->block_gamut) as $method) {
                $text = $this->$method($text);
        }

        # Finally form paragraphs and restore hashed blocks.
        return $this->formParagraphs($text);
}
444         
445         
protected function doHorizontalRules($text) {
#
# Replace horizontal-rule lines (three or more `-`, `*` or `_` markers,
# optionally separated by spaces) with a hashed <hr> element.
#
        # A single hashed <hr> token is used for every rule in the text.
        $hr_token = $this->hashBlock("<hr" . $this->empty_element_suffix);

        $rule_re = '{
                                ^[ ]{0,3}       # Leading space
                                ([-*_])         # $1: First marker
                                (?>                     # Repeated marker group
                                        [ ]{0,2}        # Zero, one, or two spaces.
                                        \1                      # Marker character
                                ){2,}           # Group repeated at least twice
                                [ ]*            # Tailing spaces
                                $                       # End of line.
                        }mx';

        return preg_replace($rule_re, "\n" . $hr_token . "\n", $text);
}
protected $span_gamut = array(
#
# These are all the transformations that occur *within* block-level
# tags like paragraphs, headers, and list items.
#
# Keys are method names and values are priorities: the constructor sorts
# the array with asort() and runSpanGamut() calls the methods in the
# resulting order.
#
        # Process character escapes, code spans, and inline HTML
        # in one shot.
        "parseSpan"           => -30,
        # Process anchor and image tags. Images must come first,
        # because ![foo][f] looks like an anchor.
        "doImages"            =>  10,
        "doAnchors"           =>  20,
        
        # Make links out of things like `<http://example.com/>`
        # Must come after doAnchors, because you can use < and >
        # delimiters in inline links like [this](<url>).
        "doAutoLinks"         =>  30,
        "encodeAmpsAndAngles" =>  40,
        "doItalicsAndBold"    =>  50,
        "doHardBreaks"        =>  60,
        );
protected function runSpanGamut($text) {
#
# Run span gamut transformations: pass $text through every span-level
# method in gamut order and return the result.
#
        foreach (array_keys($this->span_gamut) as $method) {
                $text = $this->$method($text);
        }

        return $text;
}
498         
499         
protected function doHardBreaks($text) {
#
# Do hard breaks: two or more spaces right before a newline become a
# line-break element.
#
        $callback = array($this, '_doHardBreaks_callback');

        return preg_replace_callback('/ {2,}\n/', $callback, $text);
}
protected function _doHardBreaks_callback($matches) {
#
# Build the hashed replacement for one hard break: a <br> element
# followed by a newline.
#
        $break = "<br" . $this->empty_element_suffix . "\n";

        return $this->hashPart($break);
}
protected function doAnchors($text) {
#
# Turn Markdown link shortcuts into XHTML <a> tags. Three syntaxes are
# handled, in this order: [text][id], [text](url "title") and [text].
#
        # Links cannot be nested: leave the text alone while link text is
        # already being processed.
        if ($this->in_anchor) {
                return $text;
        }
        $this->in_anchor = true;

        $reference_callback = array($this, '_doAnchors_reference_callback');
        $inline_callback    = array($this, '_doAnchors_inline_callback');

        #
        # First, handle reference-style links: [link text] [id]
        #
        $reference_re = '{
                        (                                       # wrap whole match in $1
                          \[
                                ('.$this->nested_brackets_re.') # link text = $2
                          \]
                          [ ]?                          # one optional space
                          (?:\n[ ]*)?           # one optional newline followed by spaces
                          \[
                                (.*?)           # id = $3
                          \]
                        )
                        }xs';
        $text = preg_replace_callback($reference_re, $reference_callback, $text);

        #
        # Next, inline-style links: [link text](url "optional title")
        #
        $inline_re = '{
                        (                               # wrap whole match in $1
                          \[
                                ('.$this->nested_brackets_re.') # link text = $2
                          \]
                          \(                    # literal paren
                                [ \n]*
                                (?:
                                        <(.+?)> # href = $3
                                |
                                        ('.$this->nested_url_parenthesis_re.')  # href = $4
                                )
                                [ \n]*
                                (                       # $5
                                  ([\'"])       # quote char = $6
                                  (.*?)         # Title = $7
                                  \6            # matching quote
                                  [ \n]*        # ignore any spaces/tabs between closing quote and )
                                )?                      # title is optional
                          \)
                        )
                        }xs';
        $text = preg_replace_callback($inline_re, $inline_callback, $text);

        #
        # Last, handle reference-style shortcuts: [link text]
        # These must come last in case you've also got [link text][1]
        # or [link text](/foo)
        #
        $shortcut_re = '{
                        (                                       # wrap whole match in $1
                          \[
                                ([^\[\]]+)              # link text = $2; can\'t contain [ or ]
                          \]
                        )
                        }xs';
        $text = preg_replace_callback($shortcut_re, $reference_callback, $text);

        $this->in_anchor = false;
        return $text;
}
protected function _doAnchors_reference_callback($matches) {
#
# Build the replacement for one reference-style link match:
# $1 = whole match, $2 = link text, $3 = link id (may be absent or empty).
# Returns a hashed <a> element when the id is known, otherwise the
# original text untouched.
#
        $label = $matches[2];

        # Shortcut links like [this][] or [this] use the link text as id.
        $id = isset($matches[3]) ? $matches[3] : '';
        if ($id == "") {
                $id = $label;
        }

        # Lower-case and turn embedded newlines into spaces.
        $id = preg_replace('{[ ]?\n}', ' ', strtolower($id));

        # Unknown id: this was not a link after all.
        if (!isset($this->urls[$id])) {
                return $matches[1];
        }

        $href = $this->encodeAttribute($this->urls[$id]);
        $tag  = "<a href=\"$href\"";

        if (isset($this->titles[$id])) {
                $tooltip = $this->encodeAttribute($this->titles[$id]);
                $tag .= " title=\"$tooltip\"";
        }

        # Span-level markup is allowed inside the link text.
        $tag .= ">" . $this->runSpanGamut($label) . "</a>";

        return $this->hashPart($tag);
}
protected function _doAnchors_inline_callback($matches) {
#
# Build the replacement for one inline-style link match:
# $2 = link text, $3 = <bracketed> url, $4 = bare url, $7 = optional title.
# Returns the hashed <a> element.
#
        # The href is the <bracketed> form when present, else the bare form.
        $url = $matches[3] == '' ? $matches[4] : $matches[3];
        $url = $this->encodeAttribute($url);

        $result = "<a href=\"$url\"";

        # $7 exists only when a quoted title was matched.
        if (isset($matches[7])) {
                $title = $this->encodeAttribute($matches[7]);
                $result .=  " title=\"$title\"";
        }

        # Run the span gamut over the link text exactly once, as
        # _doAnchors_reference_callback() does; the gamut's output is not
        # meant to be fed through the gamut a second time.
        $link_text = $this->runSpanGamut($matches[2]);
        $result .= ">$link_text</a>";

        return $this->hashPart($result);
}
protected function doImages($text) {
#
# Turn Markdown image shortcuts into <img> tags. Reference-style images
# are handled first, then inline images.
#
        $reference_callback = array($this, '_doImages_reference_callback');
        $inline_callback    = array($this, '_doImages_inline_callback');

        #
        # First, handle reference-style labeled images: ![alt text][id]
        #
        $reference_re = '{
                        (                               # wrap whole match in $1
                          !\[
                                ('.$this->nested_brackets_re.')         # alt text = $2
                          \]
                          [ ]?                          # one optional space
                          (?:\n[ ]*)?           # one optional newline followed by spaces
                          \[
                                (.*?)           # id = $3
                          \]
                        )
                        }xs';
        $text = preg_replace_callback($reference_re, $reference_callback, $text);

        #
        # Next, handle inline images:  ![alt text](url "optional title")
        # Don't forget: encode * and _
        #
        $inline_re = '{
                        (                               # wrap whole match in $1
                          !\[
                                ('.$this->nested_brackets_re.')         # alt text = $2
                          \]
                          \s?                   # One optional whitespace character
                          \(                    # literal paren
                                [ \n]*
                                (?:
                                        <(\S*)> # src url = $3
                                |
                                        ('.$this->nested_url_parenthesis_re.')  # src url = $4
                                )
                                [ \n]*
                                (                       # $5
                                  ([\'"])       # quote char = $6
                                  (.*?)         # title = $7
                                  \6            # matching quote
                                  [ \n]*
                                )?                      # title is optional
                          \)
                        )
                        }xs';
        $text = preg_replace_callback($inline_re, $inline_callback, $text);

        return $text;
}
690         protected function _doImages_reference_callback($matches) {
691                 $whole_match = $matches[1];
692                 $alt_text    = $matches[2];
693                 $link_id     = strtolower($matches[3]);
695                 if ($link_id == "") {
696                         $link_id = strtolower($alt_text); # for shortcut links like ![this][].
697                 }
699                 $alt_text = $this->encodeAttribute($alt_text);
700                 if (isset($this->urls[$link_id])) {
701                         $url = $this->encodeAttribute($this->urls[$link_id]);
702                         $result = "<img src=\"$url\" alt=\"$alt_text\"";
703                         if (isset($this->titles[$link_id])) {
704                                 $title = $this->titles[$link_id];
705                                 $title = $this->encodeAttribute($title);
706                                 $result .=  " title=\"$title\"";
707                         }
708                         $result .= $this->empty_element_suffix;
709                         $result = $this->hashPart($result);
710                 }
711                 else {
712                         # If there's no such link ID, leave intact:
713                         $result = $whole_match;
714                 }
716                 return $result;
717         }
718         protected function _doImages_inline_callback($matches) {
719                 $whole_match    = $matches[1];
720                 $alt_text               = $matches[2];
721                 $url                    = $matches[3] == '' ? $matches[4] : $matches[3];
722                 $title                  =& $matches[7];
724                 $alt_text = $this->encodeAttribute($alt_text);
725                 $url = $this->encodeAttribute($url);
726                 $result = "<img src=\"$url\" alt=\"$alt_text\"";
727                 if (isset($title)) {
728                         $title = $this->encodeAttribute($title);
729                         $result .=  " title=\"$title\""; # $title already quoted
730                 }
731                 $result .= $this->empty_element_suffix;
733                 return $this->hashPart($result);
734         }
737         protected function doHeaders($text) {
738                 # Setext-style headers:
739                 #         Header 1
740                 #         ========
741                 #  
742                 #         Header 2
743                 #         --------
744                 #
745                 $text = preg_replace_callback('{ ^(.+?)[ ]*\n(=+|-+)[ ]*\n+ }mx',
746                         array(&$this, '_doHeaders_callback_setext'), $text);
748                 # atx-style headers:
749                 #       # Header 1
750                 #       ## Header 2
751                 #       ## Header 2 with closing hashes ##
752                 #       ...
753                 #       ###### Header 6
754                 #
755                 $text = preg_replace_callback('{
756                                 ^(\#{1,6})      # $1 = string of #\'s
757                                 [ ]*
758                                 (.+?)           # $2 = Header text
759                                 [ ]*
760                                 \#*                     # optional closing #\'s (not counted)
761                                 \n+
762                         }xm',
763                         array(&$this, '_doHeaders_callback_atx'), $text);
765                 return $text;
766         }
767         protected function _doHeaders_callback_setext($matches) {
768                 # Terrible hack to check we haven't found an empty list item.
769                 if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1]))
770                         return $matches[0];
771                 
772                 $level = $matches[2]{0} == '=' ? 1 : 2;
773                 $block = "<h$level>".$this->runSpanGamut($matches[1])."</h$level>";
774                 return "\n" . $this->hashBlock($block) . "\n\n";
775         }
776         protected function _doHeaders_callback_atx($matches) {
777                 $level = strlen($matches[1]);
778                 $block = "<h$level>".$this->runSpanGamut($matches[2])."</h$level>";
779                 return "\n" . $this->hashBlock($block) . "\n\n";
780         }
783         protected function doLists($text) {
784         #
785         # Form HTML ordered (numbered) and unordered (bulleted) lists.
786         #
787                 $less_than_tab = $this->tab_width - 1;
789                 # Re-usable patterns to match list item bullets and number markers:
790                 $marker_ul_re  = '[*+-]';
791                 $marker_ol_re  = '\d+[\.]';
792                 $marker_any_re = "(?:$marker_ul_re|$marker_ol_re)";
794                 $markers_relist = array(
795                         $marker_ul_re => $marker_ol_re,
796                         $marker_ol_re => $marker_ul_re,
797                         );
799                 foreach ($markers_relist as $marker_re => $other_marker_re) {
800                         # Re-usable pattern to match any entirel ul or ol list:
801                         $whole_list_re = '
802                                 (                                                               # $1 = whole list
803                                   (                                                             # $2
804                                         ([ ]{0,'.$less_than_tab.'})     # $3 = number of spaces
805                                         ('.$marker_re.')                        # $4 = first list item marker
806                                         [ ]+
807                                   )
808                                   (?s:.+?)
809                                   (                                                             # $5
810                                           \z
811                                         |
812                                           \n{2,}
813                                           (?=\S)
814                                           (?!                                           # Negative lookahead for another list item marker
815                                                 [ ]*
816                                                 '.$marker_re.'[ ]+
817                                           )
818                                         |
819                                           (?=                                           # Lookahead for another kind of list
820                                             \n
821                                                 \3                                              # Must have the same indentation
822                                                 '.$other_marker_re.'[ ]+
823                                           )
824                                   )
825                                 )
826                         '; // mx
827                         
828                         # We use a different prefix before nested lists than top-level lists.
829                         # See extended comment in _ProcessListItems().
830                 
831                         if ($this->list_level) {
832                                 $text = preg_replace_callback('{
833                                                 ^
834                                                 '.$whole_list_re.'
835                                         }mx',
836                                         array(&$this, '_doLists_callback'), $text);
837                         }
838                         else {
839                                 $text = preg_replace_callback('{
840                                                 (?:(?<=\n)\n|\A\n?) # Must eat the newline
841                                                 '.$whole_list_re.'
842                                         }mx',
843                                         array(&$this, '_doLists_callback'), $text);
844                         }
845                 }
847                 return $text;
848         }
849         protected function _doLists_callback($matches) {
850                 # Re-usable patterns to match list item bullets and number markers:
851                 $marker_ul_re  = '[*+-]';
852                 $marker_ol_re  = '\d+[\.]';
853                 $marker_any_re = "(?:$marker_ul_re|$marker_ol_re)";
854                 
855                 $list = $matches[1];
856                 $list_type = preg_match("/$marker_ul_re/", $matches[4]) ? "ul" : "ol";
857                 
858                 $marker_any_re = ( $list_type == "ul" ? $marker_ul_re : $marker_ol_re );
859                 
860                 $list .= "\n";
861                 $result = $this->processListItems($list, $marker_any_re);
862                 
863                 $result = $this->hashBlock("<$list_type>\n" . $result . "</$list_type>");
864                 return "\n". $result ."\n\n";
865         }
        # Counter incremented by processListItems before it splits a list into
        # items and decremented once it is done. doLists reads it to choose
        # between its nested-list and top-level list patterns.
        protected $list_level = 0;
869         protected function processListItems($list_str, $marker_any_re) {
870         #
871         #       Process the contents of a single ordered or unordered list, splitting it
872         #       into individual list items.
873         #
874                 # The $this->list_level global keeps track of when we're inside a list.
875                 # Each time we enter a list, we increment it; when we leave a list,
876                 # we decrement. If it's zero, we're not in a list anymore.
877                 #
878                 # We do this because when we're not inside a list, we want to treat
879                 # something like this:
880                 #
881                 #               I recommend upgrading to version
882                 #               8. Oops, now this line is treated
883                 #               as a sub-list.
884                 #
885                 # As a single paragraph, despite the fact that the second line starts
886                 # with a digit-period-space sequence.
887                 #
888                 # Whereas when we're inside a list (or sub-list), that line will be
889                 # treated as the start of a sub-list. What a kludge, huh? This is
890                 # an aspect of Markdown's syntax that's hard to parse perfectly
891                 # without resorting to mind-reading. Perhaps the solution is to
892                 # change the syntax rules such that sub-lists must start with a
893                 # starting cardinal number; e.g. "1." or "a.".
894                 
895                 $this->list_level++;
897                 # trim trailing blank lines:
898                 $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);
900                 $list_str = preg_replace_callback('{
901                         (\n)?                                                   # leading line = $1
902                         (^[ ]*)                                                 # leading whitespace = $2
903                         ('.$marker_any_re.'                             # list marker and space = $3
904                                 (?:[ ]+|(?=\n)) # space only required if item is not empty
905                         )
906                         ((?s:.*?))                                              # list item text   = $4
907                         (?:(\n+(?=\n))|\n)                              # tailing blank line = $5
908                         (?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n))))
909                         }xm',
910                         array(&$this, '_processListItems_callback'), $list_str);
912                 $this->list_level--;
913                 return $list_str;
914         }
915         protected function _processListItems_callback($matches) {
916                 $item = $matches[4];
917                 $leading_line =& $matches[1];
918                 $leading_space =& $matches[2];
919                 $marker_space = $matches[3];
920                 $tailing_blank_line =& $matches[5];
922                 if ($leading_line || $tailing_blank_line || 
923                         preg_match('/\n{2,}/', $item))
924                 {
925                         # Replace marker with the appropriate whitespace indentation
926                         $item = $leading_space . str_repeat(' ', strlen($marker_space)) . $item;
927                         $item = $this->runBlockGamut($this->outdent($item)."\n");
928                 }
929                 else {
930                         # Recursion for sub-lists:
931                         $item = $this->doLists($this->outdent($item));
932                         $item = preg_replace('/\n+$/', '', $item);
933                         $item = $this->runSpanGamut($item);
934                 }
936                 return "<li>" . $item . "</li>\n";
937         }
940         protected function doCodeBlocks($text) {
941         #
942         #       Process Markdown `<pre><code>` blocks.
943         #
944                 $text = preg_replace_callback('{
945                                 (?:\n\n|\A\n?)
946                                 (                   # $1 = the code block -- one or more lines, starting with a space/tab
947                                   (?>
948                                         [ ]{'.$this->tab_width.'}  # Lines must start with a tab or a tab-width of spaces
949                                         .*\n+
950                                   )+
951                                 )
952                                 ((?=^[ ]{0,'.$this->tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
953                         }xm',
954                         array(&$this, '_doCodeBlocks_callback'), $text);
956                 return $text;
957         }
958         protected function _doCodeBlocks_callback($matches) {
959                 $codeblock = $matches[1];
961                 $codeblock = $this->outdent($codeblock);
962                 $codeblock = htmlspecialchars($codeblock, ENT_NOQUOTES);
964                 # trim leading newlines and trailing newlines
965                 $codeblock = preg_replace('/\A\n+|\n+\z/', '', $codeblock);
967                 $codeblock = "<pre><code>$codeblock\n</code></pre>";
968                 return "\n\n".$this->hashBlock($codeblock)."\n\n";
969         }
972         protected function makeCodeSpan($code) {
973         #
974         # Create a code span markup for $code. Called from handleSpanToken.
975         #
976                 $code = htmlspecialchars(trim($code), ENT_NOQUOTES);
977                 return $this->hashPart("<code>$code</code>");
978         }
        # Emphasis token patterns. prepareItalicsAndBold combines these three
        # tables into $em_strong_prepared_relist, and doItalicsAndBold looks the
        # result up with the key "$em$strong" (its currently open markers).
        #
        # In each table the '' entry matches either marker character ("*" or
        # "_") when followed by a non-space character (or end of subject), while
        # the entries keyed by a marker match that same marker when preceded by
        # a non-space character (or start of subject).

        # Single-character markers (used for <em>).
        protected $em_relist = array(
                ''  => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?=\S|$)(?![\.,:;]\s)',
                '*' => '(?<=\S|^)(?<!\*)\*(?!\*)',
                '_' => '(?<=\S|^)(?<!_)_(?!_)',
                );
        # Two-character markers (used for <strong>).
        protected $strong_relist = array(
                ''   => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?=\S|$)(?![\.,:;]\s)',
                '**' => '(?<=\S|^)(?<!\*)\*\*(?!\*)',
                '__' => '(?<=\S|^)(?<!_)__(?!_)',
                );
        # Three-character markers (used for <strong><em>).
        protected $em_strong_relist = array(
                ''    => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?=\S|$)(?![\.,:;]\s)',
                '***' => '(?<=\S|^)(?<!\*)\*\*\*(?!\*)',
                '___' => '(?<=\S|^)(?<!_)___(?!_)',
                );
        # Filled by prepareItalicsAndBold: one complete token pattern per
        # "$em$strong" key.
        protected $em_strong_prepared_relist;
997         
998         protected function prepareItalicsAndBold() {
999         #
1000         # Prepare regular expressions for searching emphasis tokens in any
1001         # context.
1002         #
1003                 foreach ($this->em_relist as $em => $em_re) {
1004                         foreach ($this->strong_relist as $strong => $strong_re) {
1005                                 # Construct list of allowed token expressions.
1006                                 $token_relist = array();
1007                                 if (isset($this->em_strong_relist["$em$strong"])) {
1008                                         $token_relist[] = $this->em_strong_relist["$em$strong"];
1009                                 }
1010                                 $token_relist[] = $em_re;
1011                                 $token_relist[] = $strong_re;
1012                                 
1013                                 # Construct master expression from list.
1014                                 $token_re = '{('. implode('|', $token_relist) .')}';
1015                                 $this->em_strong_prepared_relist["$em$strong"] = $token_re;
1016                         }
1017                 }
1018         }
1019         
1020         protected function doItalicsAndBold($text) {
1021                 $token_stack = array('');
1022                 $text_stack = array('');
1023                 $em = '';
1024                 $strong = '';
1025                 $tree_char_em = false;
1026                 
1027                 while (1) {
1028                         #
1029                         # Get prepared regular expression for seraching emphasis tokens
1030                         # in current context.
1031                         #
1032                         $token_re = $this->em_strong_prepared_relist["$em$strong"];
1033                         
1034                         #
1035                         # Each loop iteration search for the next emphasis token. 
1036                         # Each token is then passed to handleSpanToken.
1037                         #
1038                         $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
1039                         $text_stack[0] .= $parts[0];
1040                         $token =& $parts[1];
1041                         $text =& $parts[2];
1042                         
1043                         if (empty($token)) {
1044                                 # Reached end of text span: empty stack without emitting.
1045                                 # any more emphasis.
1046                                 while ($token_stack[0]) {
1047                                         $text_stack[1] .= array_shift($token_stack);
1048                                         $text_stack[0] .= array_shift($text_stack);
1049                                 }
1050                                 break;
1051                         }
1052                         
1053                         $token_len = strlen($token);
1054                         if ($tree_char_em) {
1055                                 # Reached closing marker while inside a three-char emphasis.
1056                                 if ($token_len == 3) {
1057                                         # Three-char closing marker, close em and strong.
1058                                         array_shift($token_stack);
1059                                         $span = array_shift($text_stack);
1060                                         $span = $this->runSpanGamut($span);
1061                                         $span = "<strong><em>$span</em></strong>";
1062                                         $text_stack[0] .= $this->hashPart($span);
1063                                         $em = '';
1064                                         $strong = '';
1065                                 } else {
1066                                         # Other closing marker: close one em or strong and
1067                                         # change current token state to match the other
1068                                         $token_stack[0] = str_repeat($token{0}, 3-$token_len);
1069                                         $tag = $token_len == 2 ? "strong" : "em";
1070                                         $span = $text_stack[0];
1071                                         $span = $this->runSpanGamut($span);
1072                                         $span = "<$tag>$span</$tag>";
1073                                         $text_stack[0] = $this->hashPart($span);
1074                                         $$tag = ''; # $$tag stands for $em or $strong
1075                                 }
1076                                 $tree_char_em = false;
1077                         } else if ($token_len == 3) {
1078                                 if ($em) {
1079                                         # Reached closing marker for both em and strong.
1080                                         # Closing strong marker:
1081                                         for ($i = 0; $i < 2; ++$i) {
1082                                                 $shifted_token = array_shift($token_stack);
1083                                                 $tag = strlen($shifted_token) == 2 ? "strong" : "em";
1084                                                 $span = array_shift($text_stack);
1085                                                 $span = $this->runSpanGamut($span);
1086                                                 $span = "<$tag>$span</$tag>";
1087                                                 $text_stack[0] .= $this->hashPart($span);
1088                                                 $$tag = ''; # $$tag stands for $em or $strong
1089                                         }
1090                                 } else {
1091                                         # Reached opening three-char emphasis marker. Push on token 
1092                                         # stack; will be handled by the special condition above.
1093                                         $em = $token{0};
1094                                         $strong = "$em$em";
1095                                         array_unshift($token_stack, $token);
1096                                         array_unshift($text_stack, '');
1097                                         $tree_char_em = true;
1098                                 }
1099                         } else if ($token_len == 2) {
1100                                 if ($strong) {
1101                                         # Unwind any dangling emphasis marker:
1102                                         if (strlen($token_stack[0]) == 1) {
1103                                                 $text_stack[1] .= array_shift($token_stack);
1104                                                 $text_stack[0] .= array_shift($text_stack);
1105                                         }
1106                                         # Closing strong marker:
1107                                         array_shift($token_stack);
1108                                         $span = array_shift($text_stack);
1109                                         $span = $this->runSpanGamut($span);
1110                                         $span = "<strong>$span</strong>";
1111                                         $text_stack[0] .= $this->hashPart($span);
1112                                         $strong = '';
1113                                 } else {
1114                                         array_unshift($token_stack, $token);
1115                                         array_unshift($text_stack, '');
1116                                         $strong = $token;
1117                                 }
1118                         } else {
1119                                 # Here $token_len == 1
1120                                 if ($em) {
1121                                         if (strlen($token_stack[0]) == 1) {
1122                                                 # Closing emphasis marker:
1123                                                 array_shift($token_stack);
1124                                                 $span = array_shift($text_stack);
1125                                                 $span = $this->runSpanGamut($span);
1126                                                 $span = "<em>$span</em>";
1127                                                 $text_stack[0] .= $this->hashPart($span);
1128                                                 $em = '';
1129                                         } else {
1130                                                 $text_stack[0] .= $token;
1131                                         }
1132                                 } else {
1133                                         array_unshift($token_stack, $token);
1134                                         array_unshift($text_stack, '');
1135                                         $em = $token;
1136                                 }
1137                         }
1138                 }
1139                 return $text_stack[0];
1140         }
1143         protected function doBlockQuotes($text) {
1144                 $text = preg_replace_callback('/
1145                           (                                                             # Wrap whole match in $1
1146                                 (?>
1147                                   ^[ ]*>[ ]?                    # ">" at the start of a line
1148                                         .+\n                                    # rest of the first line
1149                                   (.+\n)*                                       # subsequent consecutive lines
1150                                   \n*                                           # blanks
1151                                 )+
1152                           )
1153                         /xm',
1154                         array(&$this, '_doBlockQuotes_callback'), $text);
1156                 return $text;
1157         }
1158         protected function _doBlockQuotes_callback($matches) {
1159                 $bq = $matches[1];
1160                 # trim one level of quoting - trim whitespace-only lines
1161                 $bq = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $bq);
1162                 $bq = $this->runBlockGamut($bq);                # recurse
1164                 $bq = preg_replace('/^/m', "  ", $bq);
1165                 # These leading spaces cause problem with <pre> content, 
1166                 # so we need to fix that:
1167                 $bq = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx', 
1168                         array(&$this, '_doBlockQuotes_callback2'), $bq);
1170                 return "\n". $this->hashBlock("<blockquote>\n$bq\n</blockquote>")."\n\n";
1171         }
1172         protected function _doBlockQuotes_callback2($matches) {
1173                 $pre = $matches[1];
1174                 $pre = preg_replace('/^  /m', '', $pre);
1175                 return $pre;
1176         }
1179         protected function formParagraphs($text) {
1180         #
1181         #       Params:
1182         #               $text - string to process with html <p> tags
1183         #
1184                 # Strip leading and trailing lines:
1185                 $text = preg_replace('/\A\n+|\n+\z/', '', $text);
1187                 $grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);
1189                 #
1190                 # Wrap <p> tags and unhashify HTML blocks
1191                 #
1192                 foreach ($grafs as $key => $value) {
1193                         if (!preg_match('/^B\x1A[0-9]+B$/', $value)) {
1194                                 # Is a paragraph.
1195                                 $value = $this->runSpanGamut($value);
1196                                 $value = preg_replace('/^([ ]*)/', "<p>", $value);
1197                                 $value .= "</p>";
1198                                 $grafs[$key] = $this->unhash($value);
1199                         }
1200                         else {
1201                                 # Is a block.
1202                                 # Modify elements of @grafs in-place...
1203                                 $graf = $value;
1204                                 $block = $this->html_hashes[$graf];
1205                                 $graf = $block;
1206 //                              if (preg_match('{
1207 //                                      \A
1208 //                                      (                                                       # $1 = <div> tag
1209 //                                        <div  \s+
1210 //                                        [^>]*
1211 //                                        \b
1212 //                                        markdown\s*=\s*  ([\'"])      #       $2 = attr quote char
1213 //                                        1
1214 //                                        \2
1215 //                                        [^>]*
1216 //                                        >
1217 //                                      )
1218 //                                      (                                                       # $3 = contents
1219 //                                      .*
1220 //                                      )
1221 //                                      (</div>)                                        # $4 = closing tag
1222 //                                      \z
1223 //                                      }xs', $block, $matches))
1224 //                              {
1225 //                                      list(, $div_open, , $div_content, $div_close) = $matches;
1226 //
1227 //                                      # We can't call Markdown(), because that resets the hash;
1228 //                                      # that initialization code should be pulled into its own sub, though.
1229 //                                      $div_content = $this->hashHTMLBlocks($div_content);
1230 //                                      
1231 //                                      # Run document gamut methods on the content.
1232 //                                      foreach ($this->document_gamut as $method => $priority) {
1233 //                                              $div_content = $this->$method($div_content);
1234 //                                      }
1235 //
1236 //                                      $div_open = preg_replace(
1237 //                                              '{\smarkdown\s*=\s*([\'"]).+?\1}', '', $div_open);
1238 //
1239 //                                      $graf = $div_open . "\n" . $div_content . "\n" . $div_close;
1240 //                              }
1241                                 $grafs[$key] = $graf;
1242                         }
1243                 }
1245                 return implode("\n\n", $grafs);
1246         }
1249         protected function encodeAttribute($text) {
1250         #
1251         # Encode text for a double-quoted HTML attribute. This function
1252         # is *not* suitable for attributes enclosed in single quotes.
1253         #
1254                 $text = $this->encodeAmpsAndAngles($text);
1255                 $text = str_replace('"', '&quot;', $text);
1256                 return $text;
1257         }
1258         
1259         
protected function encodeAmpsAndAngles($text) {
#
# Escape the ampersands and opening angle brackets found in $text.
#
# In no-entities mode every `&` becomes `&amp;`. Otherwise an `&` is only
# escaped when it does not already start something shaped like a
# character entity (`&name;`, `&#123;`, `&#x1F;`). Every `<` is always
# replaced by `&lt;`; `>` is never touched.
#
    if ($this->no_entities) {
        $escaped = str_replace('&', '&amp;', $text);
    } else {
        # Ampersand-encoding based entirely on Nat Irons's Amputator
        # MT plugin: <http://bumppo.net/projects/amputator/>
        $entity_lookahead_re = '/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/';
        $escaped = preg_replace($entity_lookahead_re, '&amp;', $text);
    }

    return str_replace('<', '&lt;', $escaped);
}
protected function doAutoLinks($text) {
#
# Turn `<url>` and `<email@address>` shorthands into links.
#
# URLs (http, https, ftp or dict schemes) are handled first by
# _doAutoLinks_url_callback(); email addresses, with an optional
# `mailto:` prefix, are then handled by _doAutoLinks_email_callback().
#
    $url_re = '{<((https?|ftp|dict):[^\'">\s]+)>}i';

    # Email addresses: <address@domain.foo>
    $email_re = '{
                        <
                        (?:mailto:)?
                        (
                                (?:
                                        [-!#$%&\'*+/=?^_`.{|}~\w\x80-\xFF]+
                                |
                                        ".*?"
                                )
                                \@
                                (?:
                                        [-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+
                                |
                                        \[[\d.a-fA-F:]+\]       # IPv4 & IPv6
                                )
                        )
                        >
                        }xi';

    $with_urls = preg_replace_callback($url_re,
        array($this, '_doAutoLinks_url_callback'), $text);

    return preg_replace_callback($email_re,
        array($this, '_doAutoLinks_email_callback'), $with_urls);
}
protected function _doAutoLinks_url_callback($matches) {
#
# Callback for doAutoLinks(): $matches[1] is the URL found between the
# angle brackets. The attribute-encoded URL is used both as the href and
# as the link text, and the finished anchor is passed through hashPart().
#
    $href = $this->encodeAttribute($matches[1]);
    return $this->hashPart('<a href="' . $href . '">' . $href . '</a>');
}
protected function _doAutoLinks_email_callback($matches) {
#
# Callback for doAutoLinks(): $matches[1] is the email address (without
# any `mailto:` prefix). It is turned into an obfuscated mailto link by
# encodeEmailAddress() and the result is passed through hashPart().
#
    return $this->hashPart($this->encodeEmailAddress($matches[1]));
}
protected function encodeEmailAddress($addr) {
#
#   Input: an email address, e.g. "foo@example.com"
#
#   Output: an `<a href="mailto:...">...</a>` link in which most ASCII
#       characters of the address (and of the `mailto:` prefix) are
#       written as decimal or hexadecimal character references, in the
#       hope of foiling address-harvesting bots.
#
#   The choice between raw, hex and decimal is pseudo-random but derived
#   from a CRC of the address, so the same input always gives the same
#   output. Non-ASCII bytes are passed through unchanged.
#
#   Based on a filter by Matthew Wickline, posted to BBEdit-Talk.
#   With some optimizations by Milian Wolff.
#
    $mailto = "mailto:" . $addr;
    $pieces = preg_split('/(?<!^)(?!$)/', $mailto);
    $seed = (int)abs(crc32($mailto) / strlen($mailto)); # Deterministic seed.

    foreach ($pieces as $pos => $piece) {
        $code = ord($piece);

        # Ignore non-ascii chars.
        if ($code >= 128) {
            continue;
        }

        $roll = ($seed * (1 + $pos)) % 100; # Pseudo-random function.

        # roughly 10% raw, 45% hex, 45% dec
        # '@' *must* be encoded, so it never takes the "raw" path.
        if ($roll > 90 && $piece != '@') {
            continue;
        }

        $pieces[$pos] = ($roll < 45)
            ? '&#x'.dechex($code).';'
            : '&#'.$code.';';
    }

    $href  = implode('', $pieces);
    $label = implode('', array_slice($pieces, 7)); # text without `mailto:`

    return "<a href=\"$href\">$label</a>";
}
protected function parseSpan($str) {
#
# Walk through $str looking for the next backslash-escaped character,
# code span marker or (unless no-markup mode is on) inline HTML tag,
# comment or processing instruction. Text between tokens is copied to
# the output unchanged; each token is replaced by whatever
# handleSpanToken() returns for it.
#
    # Alternatives matching raw HTML; left out entirely in no-markup mode.
    $markup_re = $this->no_markup ? '' : '
                                |
                                        <!--    .*?     -->             # comment
                                |
                                        <\?.*?\?> | <%.*?%>             # processing instruction
                                |
                                        <[!$]?[-a-zA-Z0-9:_]+   # regular tags
                                        (?>
                                                \s
                                                (?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
                                        )?
                                        >
                                |
                                        <[-a-zA-Z0-9:_]+\s*/> # xml-style empty tag
                                |
                                        </[-a-zA-Z0-9:_]+\s*> # closing tag
                        ';

    $span_re = '{
                                (
                                        \\\\'.$this->escape_chars_re.'
                                |
                                        (?<![`\\\\])
                                        `+                                              # code span marker
                        '.$markup_re.'
                                )
                                }xs';

    $output = '';

    for (;;) {
        # Split once at the next token: [ text before, token, text after ].
        $parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);

        # Text preceding the token (or all remaining text) is kept as is.
        $output .= $parts[0];

        # No token captured: the end of the string has been reached.
        if (!isset($parts[1])) {
            return $output;
        }

        # handleSpanToken() receives the remaining text by reference and
        # may consume part of it (e.g. the body of a code span).
        $output .= $this->handleSpanToken($parts[1], $parts[2]);
        $str = $parts[2];
    }
}
1419         
1420         
protected function handleSpanToken($token, &$str) {
#
# Handle $token provided by parseSpan by determining its nature and
# returning the corresponding value that should replace it.
#
# $token  The matched token: a backslash escape, a run of backticks, or
#         a piece of inline markup.
# $str    Text following the token, passed by reference. When a code
#         span is recognised, the span body and its closing marker are
#         removed from $str.
#
# String offsets use square brackets: the curly-brace form (`$token{0}`)
# is deprecated in PHP 7.4 and is a parse error in PHP 8.
#
    switch ($token[0]) {
        case "\\":
            # Escaped character: emit it as a numeric character reference.
            return $this->hashPart("&#". ord($token[1]). ";");
        case "`":
            # Search for end marker in remaining text: the same number
            # of backticks, not followed by another backtick.
            if (preg_match('/^(.*?[^`])'.preg_quote($token).'(?!`)(.*)$/sm',
                $str, $matches))
            {
                $str = $matches[2];
                $codespan = $this->makeCodeSpan($matches[1]);
                return $this->hashPart($codespan);
            }
            return $token; // return as text since no ending marker found.
        default:
            # Anything else is inline markup, protected as is.
            return $this->hashPart($token);
    }
}
protected function outdent($text) {
#
# Strip one level of indentation from the start of every line of $text:
# either a single tab or between one and `tab_width` spaces.
#
    $indent_re = '/^(\t|[ ]{1,'.$this->tab_width.'})/m';

    return preg_replace($indent_re, '', $text);
}
1453         # String length function for detab. `_initDetab` will create a function to 
1454         # handle UTF-8 if the default function does not exist.
1455         protected $utf8_strlen = 'mb_strlen';
1456         
protected function detab($text) {
#
# Replace tabs with the appropriate amount of space.
#
# Only lines that contain at least one tab are rewritten; each of them
# is handed to _detab_callback(), which rebuilds the line with spaces.
#
    return preg_replace_callback('/^.*\t.*$/m',
        array($this, '_detab_callback'), $text);
}
protected function _detab_callback($matches) {
#
# Callback for detab(): $matches[0] is one line containing tabs.
#
# The line is cut at every tab; the pieces are glued back together with
# enough spaces to reach the next multiple of `tab_width`. The current
# width is measured with the callable stored in `utf8_strlen`, which is
# called with the text and the string 'UTF-8'.
#
    $measure = $this->utf8_strlen; # strlen function for UTF-8.

    $segments = explode("\t", $matches[0]);
    $expanded = array_shift($segments); # Text before the first tab.

    foreach ($segments as $segment) {
        # Columns already used within the current tab stop.
        $used = $measure($expanded, 'UTF-8') % $this->tab_width;
        $expanded .= str_repeat(" ", $this->tab_width - $used) . $segment;
    }

    return $expanded;
}
protected function _initDetab() {
#
# Check for the availability of the function named in the `utf8_strlen`
# property (initially `mb_strlen`). If that function is not available,
# replace the property with a closure that loosely counts the number of
# UTF-8 characters with a regular expression.
#
# A closure is used instead of create_function(), which is deprecated
# since PHP 7.2 and removed in PHP 8. If the property already holds
# something other than a function name (e.g. the closure installed by a
# previous call), nothing is done.
#
    if (!is_string($this->utf8_strlen) || function_exists($this->utf8_strlen)) {
        return;
    }

    $this->utf8_strlen = function ($text) {
        # One match per ASCII/continuation byte, or per lead byte plus
        # its continuation bytes; preg_match_all() returns the count.
        return preg_match_all(
            '/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/',
            $text, $m);
    };
}
protected function unhash($text) {
#
# Swap the original content back in for every hash placeholder found in
# $text. A placeholder is a marker character, the byte \x1A, a run of
# digits, and the same marker character again; the replacement is looked
# up by _unhash_callback().
#
    $placeholder_re = '/(.)\x1A[0-9]+\1/';

    return preg_replace_callback($placeholder_re,
        array($this, '_unhash_callback'), $text);
}
protected function _unhash_callback($matches) {
#
# Callback for unhash(): $matches[0] is a complete placeholder, which is
# used as the key into the `html_hashes` table.
#
    $placeholder = $matches[0];

    return $this->html_hashes[$placeholder];
}
1516 # Temporary Markdown Extra Parser Implementation Class
1518 # NOTE: DON'T USE THIS CLASS
1519 # Currently the implementation of Extra resides here in this temporary class.
1520 # This makes it easier to propagate the changes between the three different
1521 # packaging styles of PHP Markdown. When this issue is resolved, this
1522 # MarkdownExtra_TmpImpl class here will disappear and \Michelf\MarkdownExtra
1523 # will contain the code. So please use \Michelf\MarkdownExtra and ignore this
1524 # one.
1527 class _MarkdownExtra_TmpImpl extends \Michelf\Markdown {
1529         ### Configuration Variables ###
1531         # Prefix for footnote ids.
1532         public $fn_id_prefix = "";
1533         
1534         # Optional title attribute for footnote links and backlinks.
1535         public $fn_link_title = "";
1536         public $fn_backlink_title = "";
1537         
1538         # Optional class attribute for footnote links and backlinks.
1539         public $fn_link_class = "footnote-ref";
1540         public $fn_backlink_class = "footnote-backref";
1542         # Class name for table cell alignment (%% replaced left/center/right)
1543         # For instance: 'go-%%' becomes 'go-left' or 'go-right' or 'go-center'
1544         # If empty, the align attribute is used instead of a class name.
1545         public $table_align_class_tmpl = '';
1547         # Optional class prefix for fenced code block.
1548         public $code_class_prefix = "";
1549         # Class attribute for code blocks goes on the `code` tag;
1550         # setting this to true will put attributes on the `pre` tag instead.
1551         public $code_attr_on_pre = false;
1552         
1553         # Predefined abbreviations.
1554         public $predef_abbr = array();
1557         ### Parser Implementation ###
public function __construct() {
#
# Constructor function. Initialize the parser object.
#
# The Extra-specific escapable characters and transformations are
# registered *before* delegating to the parent constructor, which is
# expected to build the escape table and sort the gamuts.
#
    $extra_document_gamut = array(
        "doFencedCodeBlocks" => 5,
        "stripFootnotes"     => 15,
        "stripAbbreviations" => 25,
        "appendFootnotes"    => 50,
        );
    $extra_block_gamut = array(
        "doFencedCodeBlocks" => 5,
        "doTables"           => 15,
        "doDefLists"         => 45,
        );
    $extra_span_gamut = array(
        "doFootnotes"        => 5,
        "doAbbreviations"    => 70,
        );

    # Extra escapable characters.
    $this->escape_chars = $this->escape_chars . ':|';

    # Array union: entries already present in a gamut are kept as they are.
    $this->document_gamut = $this->document_gamut + $extra_document_gamut;
    $this->block_gamut    = $this->block_gamut + $extra_block_gamut;
    $this->span_gamut     = $this->span_gamut + $extra_span_gamut;

    parent::__construct();
}
1587         
1588         
1589         # Extra variables used during extra transformations.
1590         protected $footnotes = array();
1591         protected $footnotes_ordered = array();
1592         protected $footnotes_ref_count = array();
1593         protected $footnotes_numbers = array();
1594         protected $abbr_desciptions = array();
1595         protected $abbr_word_re = '';
1596         
1597         # Give the current footnote number.
1598         protected $footnote_counter = 1;
1599         
1600         
protected function setup() {
#
# Setting up Extra-specific variables.
#
# Runs the parent setup, resets the footnote and abbreviation state, then
# loads the predefined abbreviations: each word is regex-quoted and joined
# into the `abbr_word_re` alternation, and its trimmed description is
# stored in `abbr_desciptions`.
#
    parent::setup();

    $this->footnotes = array();
    $this->footnotes_ordered = array();
    $this->footnotes_ref_count = array();
    $this->footnotes_numbers = array();
    $this->abbr_desciptions = array();
    $this->abbr_word_re = '';
    $this->footnote_counter = 1;

    # Collect the quoted words and join them once. Testing the partial
    # pattern for truthiness to decide whether a `|` is needed would skip
    # the separator when the pattern so far is the string "0".
    $quoted_words = array();
    foreach ($this->predef_abbr as $abbr_word => $abbr_desc) {
        $quoted_words[] = preg_quote($abbr_word);
        $this->abbr_desciptions[$abbr_word] = trim($abbr_desc);
    }
    $this->abbr_word_re = implode('|', $quoted_words);
}
1622         
protected function teardown() {
#
# Clearing Extra-specific variables, then letting the parent class do
# its own teardown.
#
    $array_properties = array(
        'footnotes',
        'footnotes_ordered',
        'footnotes_ref_count',
        'footnotes_numbers',
        'abbr_desciptions',
        );
    foreach ($array_properties as $property) {
        $this->$property = array();
    }
    $this->abbr_word_re = '';

    parent::teardown();
}
1636         
1637         
1638         ### Extra Attribute Parser ###
1640         # Expression to use to catch attributes (includes the braces)
1641         protected $id_class_attr_catch_re = '\{((?:[ ]*[#.][-_:a-zA-Z0-9]+){1,})[ ]*\}';
1642         # Expression to use when parsing in a context when no capture is desired
1643         protected $id_class_attr_nocatch_re = '\{(?:[ ]*[#.][-_:a-zA-Z0-9]+){1,}[ ]*\}';
protected function doExtraAttributes($tag_name, $attr) {
#
# Parse attributes caught by the $this->id_class_attr_catch_re expression
# and return the HTML-formatted list of attributes.
#
# Currently supported attributes are .class and #id.
#
# $tag_name  Not used by this implementation.
# $attr      Attribute text such as "#intro .note .wide"; may be empty
#            or null.
#
# Returns a string starting with a space (e.g. ` id="intro" class="note
# wide"`), or "" when there is nothing to output. Only the first id is
# kept; all classes are kept in order.
#
    if (empty($attr)) return "";

    # Split on components
    preg_match_all('/[#.][-_:a-zA-Z0-9]+/', $attr, $matches);
    $elements = $matches[0];

    # handle classes and ids (only first id taken into account)
    # Note: square-bracket offsets; `$element{0}` no longer parses in PHP 8.
    $classes = array();
    $id = false;
    foreach ($elements as $element) {
        if ($element[0] == '.') {
            $classes[] = substr($element, 1);
        } else if ($element[0] == '#') {
            if ($id === false) $id = substr($element, 1);
        }
    }

    # compose attributes as string
    $attr_str = "";
    # Compare against false rather than using empty(), so that the id
    # "0" is not silently dropped.
    if ($id !== false) {
        $attr_str .= ' id="'.$id.'"';
    }
    if (!empty($classes)) {
        $attr_str .= ' class="'.implode(" ", $classes).'"';
    }
    return $attr_str;
}
protected function stripLinkDefinitions($text) {
#
# Remove link definitions from $text. Each definition found is handed to
# _stripLinkDefinitions_callback(), which records its URL, title and
# extra id/class attributes; the definition itself is replaced by
# whatever the callback returns.
#
    $max_indent = $this->tab_width - 1;
    $attr_re = $this->id_class_attr_catch_re;

    # Link defs are in the form: ^[id]: url "optional title"
    $definition_re = '{
                                                        ^[ ]{0,'.$max_indent.'}\[(.+)\][ ]?: # id = $1
                                                          [ ]*
                                                          \n?                           # maybe *one* newline
                                                          [ ]*
                                                        (?:
                                                          <(.+?)>                       # url = $2
                                                        |
                                                          (\S+?)                        # url = $3
                                                        )
                                                          [ ]*
                                                          \n?                           # maybe one newline
                                                          [ ]*
                                                        (?:
                                                                (?<=\s)                 # lookbehind for whitespace
                                                                ["(]
                                                                (.*?)                   # title = $4
                                                                [")]
                                                                [ ]*
                                                        )?      # title is optional
                                        (?:[ ]* '.$attr_re.' )?  # $5 = extra id & class attr
                                                        (?:\n+|\Z)
                        }xm';

    return preg_replace_callback($definition_re,
        array($this, '_stripLinkDefinitions_callback'),
        $text);
}
protected function _stripLinkDefinitions_callback($matches) {
#
# Callback for stripLinkDefinitions(). Records one link definition under
# its lowercased id:
#   - `urls`:     the URL, from $matches[2] (bracketed form) or, when
#                 that is empty, $matches[3] (bare form);
#   - `titles`:   the optional title ($matches[4]), null when absent;
#   - `ref_attr`: the attribute string produced by doExtraAttributes()
#                 from the optional id/class block ($matches[5]).
# Returns the empty string, which replaces the definition in the text.
#
    $key = strtolower($matches[1]);

    # Trailing optional groups that did not take part in the match are
    # missing from $matches, hence the isset() checks.
    $title = isset($matches[4]) ? $matches[4] : null;
    $extra = isset($matches[5]) ? $matches[5] : null;

    $this->urls[$key] = ($matches[2] == '') ? $matches[3] : $matches[2];
    $this->titles[$key] = $title;
    $this->ref_attr[$key] = $this->doExtraAttributes("", $extra);

    return ''; # String that will replace the block
}
1726         ### HTML Block Parser ###
1727         
1728         # Tags that are always treated as block tags:
1729         protected $block_tags_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|form|fieldset|iframe|hr|legend|article|section|nav|aside|hgroup|header|footer|figcaption';
1730                                                    
1731         # Tags treated as block tags only if the opening tag is alone on its line:
1732         protected $context_block_tags_re = 'script|noscript|ins|del|iframe|object|source|track|param|math|svg|canvas|audio|video';
1733         
1734         # Tags where markdown="1" default to span mode:
1735         protected $contain_span_tags_re = 'p|h[1-6]|li|dd|dt|td|th|legend|address';
1736         
1737         # Tags which must not have their contents modified, no matter where 
1738         # they appear:
1739         protected $clean_tags_re = 'script|math|svg';
1740         
1741         # Tags that do not need to be closed.
1742         protected $auto_close_tags_re = 'hr|img|param|source|track';
1743         
protected function hashHTMLBlocks($text) {
#
# Hashify HTML Blocks and "clean tags".
#
# Only block-level HTML (headers, lists, tables, ...) is meant to be
# protected here, so that text wrapped in non-block-level tags such as
# anchors, phrase emphasis and spans still ends up inside <p> tags.
#
# The work is delegated to _hashHTMLBlocks_inMarkdown(), which calls
# _hashHTMLBlocks_inHTML() when it encounters block tags; when a
# markdown="1" attribute is found within a tag, that function calls back
# into _hashHTMLBlocks_inMarkdown() for the tag's content. The two
# functions are mutually recursive.
#
# In no-markup mode the text is returned untouched.
#
    if (!$this->no_markup) {
        # The hasher returns ( processed text , remaining text ); only
        # the processed text is of interest here.
        $result = $this->_hashHTMLBlocks_inMarkdown($text);
        $text = $result[0];
    }

    return $text;
}
1770         protected function _hashHTMLBlocks_inMarkdown($text, $indent = 0,
1771                                                                                 $enclosing_tag_re = '', $span = false)
1772         {
1773         #
1774         # Parse markdown text, calling _HashHTMLBlocks_InHTML for block tags.
1775         #
1776         # *   $indent is the number of space to be ignored when checking for code 
1777         #     blocks. This is important because if we don't take the indent into 
1778         #     account, something like this (which looks right) won't work as expected:
1779         #
1780         #     <div>
1781         #         <div markdown="1">
1782         #         Hello World.  <-- Is this a Markdown code block or text?
1783         #         </div>  <-- Is this a Markdown code block or a real tag?
1784         #     <div>
1785         #
1786         #     If you don't like this, just don't indent the tag on which
1787         #     you apply the markdown="1" attribute.
1788         #
1789         # *   If $enclosing_tag_re is not empty, stops at the first unmatched closing 
1790         #     tag with that name. Nested tags supported.
1791         #
1792         # *   If $span is true, text inside must treated as span. So any double 
1793         #     newline will be replaced by a single newline so that it does not create 
1794         #     paragraphs.
1795         #
1796         # Returns an array of that form: ( processed text , remaining text )
1797         #
1798                 if ($text === '') return array('', '');
1800                 # Regex to check for the presense of newlines around a block tag.
1801                 $newline_before_re = '/(?:^\n?|\n\n)*$/';
1802                 $newline_after_re = 
1803                         '{
1804                                 ^                                               # Start of text following the tag.
1805                                 (?>[ ]*<!--.*?-->)?             # Optional comment.
1806                                 [ ]*\n                                  # Must be followed by newline.
1807                         }xs';
1808                 
1809                 # Regex to match any tag.
1810                 $block_tag_re =
1811                         '{
1812                                 (                                       # $2: Capture whole tag.
1813                                         </?                                     # Any opening or closing tag.
1814                                                 (?>                             # Tag name.
1815                                                         '.$this->block_tags_re.'                        |
1816                                                         '.$this->context_block_tags_re.'        |
1817                                                         '.$this->clean_tags_re.'                |
1818                                                         (?!\s)'.$enclosing_tag_re.'
1819                                                 )
1820                                                 (?:
1821                                                         (?=[\s"\'/a-zA-Z0-9])   # Allowed characters after tag name.
1822                                                         (?>
1823                                                                 ".*?"           |       # Double quotes (can contain `>`)
1824                                                                 \'.*?\'         |       # Single quotes (can contain `>`)
1825                                                                 .+?                             # Anything but quotes and `>`.
1826                                                         )*?
1827                                                 )?
1828                                         >                                       # End of tag.
1829                                 |
1830                                         <!--    .*?     -->     # HTML Comment
1831                                 |
1832                                         <\?.*?\?> | <%.*?%>     # Processing instruction
1833                                 |
1834                                         <!\[CDATA\[.*?\]\]>     # CData Block
1835                                 |
1836                                         # Code span marker
1837                                         `+
1838                                 '. ( !$span ? ' # If not in span.
1839                                 |
1840                                         # Indented code block
1841                                         (?: ^[ ]*\n | ^ | \n[ ]*\n )
1842                                         [ ]{'.($indent+4).'}[^\n]* \n
1843                                         (?>
1844                                                 (?: [ ]{'.($indent+4).'}[^\n]* | [ ]* ) \n
1845                                         )*
1846                                 |
1847                                         # Fenced code block marker
1848                                         (?<= ^ | \n )
1849                                         [ ]{0,'.($indent+3).'}~{3,}
1850                                                                         [ ]*
1851                                         (?:
1852                                         \.?[-_:a-zA-Z0-9]+ # standalone class name
1853                                         |
1854                                                 '.$this->id_class_attr_nocatch_re.' # extra attributes
1855                                         )?
1856                                         [ ]*
1857                                         \n
1858                                 ' : '' ). ' # End (if not is span).
1859                                 )
1860                         }xs';
1862                 
1863                 $depth = 0;             # Current depth inside the tag tree.
1864                 $parsed = "";   # Parsed text that will be returned.
1866                 #
1867                 # Loop through every tag until we find the closing tag of the parent
1868                 # or loop until reaching the end of text if no parent tag specified.
1869                 #
1870                 do {
1871                         #
1872                         # Split the text using the first $tag_match pattern found.
1873                         # Text before  pattern will be first in the array, text after
1874                         # pattern will be at the end, and between will be any catches made 
1875                         # by the pattern.
1876                         #
1877                         $parts = preg_split($block_tag_re, $text, 2, 
1878                                                                 PREG_SPLIT_DELIM_CAPTURE);
1879                         
1880                         # If in Markdown span mode, add a empty-string span-level hash 
1881                         # after each newline to prevent triggering any block element.
1882                         if ($span) {
1883                                 $void = $this->hashPart("", ':');
1884                                 $newline = "$void\n";
1885                                 $parts[0] = $void . str_replace("\n", $newline, $parts[0]) . $void;
1886                         }
1887                         
1888                         $parsed .= $parts[0]; # Text before current tag.
1889                         
1890                         # If end of $text has been reached. Stop loop.
1891                         if (count($parts) < 3) {
1892                                 $text = "";
1893                                 break;
1894                         }
1895                         
1896                         $tag  = $parts[1]; # Tag to handle.
1897                         $text = $parts[2]; # Remaining text after current tag.
1898                         $tag_re = preg_quote($tag); # For use in a regular expression.
1899                         
1900                         #
1901                         # Check for: Code span marker
1902                         #
1903                         if ($tag{0} == "`") {
1904                                 # Find corresponding end marker.
1905                                 $tag_re = preg_quote($tag);
1906                                 if (preg_match('{^(?>.+?|\n(?!\n))*?(?<!`)'.$tag_re.'(?!`)}',
1907                                         $text, $matches))
1908                                 {
1909                                         # End marker found: pass text unchanged until marker.
1910                                         $parsed .= $tag . $matches[0];
1911                                         $text = substr($text, strlen($matches[0]));
1912                                 }
1913                                 else {
1914                                         # Unmatched marker: just skip it.
1915                                         $parsed .= $tag;
1916                                 }
1917                         }
1918                         #
1919                         # Check for: Fenced code block marker.
1920                         #
1921                         else if (preg_match('{^\n?([ ]{0,'.($indent+3).'})(~+)}', $tag, $capture)) {
1922                                 # Fenced code block marker: find matching end marker.
1923                                 $fence_indent = strlen($capture[1]); # use captured indent in re
1924                                 $fence_re = $capture[2]; # use captured fence in re
1925                                 if (preg_match('{^(?>.*\n)*?[ ]{'.($fence_indent).'}'.$fence_re.'[ ]*(?:\n|$)}', $text,
1926                                         $matches)) 
1927                                 {
1928                                         # End marker found: pass text unchanged until marker.
1929                                         $parsed .= $tag . $matches[0];
1930                                         $text = substr($text, strlen($matches[0]));
1931                                 }
1932                                 else {
1933                                         # No end marker: just skip it.
1934                                         $parsed .= $tag;
1935                                 }
1936                         }
1937                         #
1938                         # Check for: Indented code block.
1939                         #
1940                         else if ($tag{0} == "\n" || $tag{0} == " ") {
1941                                 # Indented code block: pass it unchanged, will be handled 
1942                                 # later.
1943                                 $parsed .= $tag;
1944                         }
1945                         #
1946                         # Check for: Opening Block level tag or
1947                         #            Opening Context Block tag (like ins and del) 
1948                         #               used as a block tag (tag is alone on it's line).
1949                         #
1950                         else if (preg_match('{^<(?:'.$this->block_tags_re.')\b}', $tag) ||
1951                                 (       preg_match('{^<(?:'.$this->context_block_tags_re.')\b}', $tag) &&
1952                                         preg_match($newline_before_re, $parsed) &&
1953                                         preg_match($newline_after_re, $text)    )
1954                                 )
1955                         {
1956                                 # Need to parse tag and following text using the HTML parser.
1957                                 list($block_text, $text) = 
1958                                         $this->_hashHTMLBlocks_inHTML($tag . $text, "hashBlock", true);
1959                                 
1960                                 # Make sure it stays outside of any paragraph by adding newlines.
1961                                 $parsed .= "\n\n$block_text\n\n";
1962                         }
1963                         #
1964                         # Check for: Clean tag (like script, math)
1965                         #            HTML Comments, processing instructions.
1966                         #
1967                         else if (preg_match('{^<(?:'.$this->clean_tags_re.')\b}', $tag) ||
1968                                 $tag{1} == '!' || $tag{1} == '?')
1969                         {
1970                                 # Need to parse tag and following text using the HTML parser.
1971                                 # (don't check for markdown attribute)
1972                                 list($block_text, $text) = 
1973                                         $this->_hashHTMLBlocks_inHTML($tag . $text, "hashClean", false);
1974                                 
1975                                 $parsed .= $block_text;
1976                         }
1977                         #
1978                         # Check for: Tag with same name as enclosing tag.
1979                         #
1980                         else if ($enclosing_tag_re !== '' &&
1981                                 # Same name as enclosing tag.
1982                                 preg_match('{^</?(?:'.$enclosing_tag_re.')\b}', $tag))
1983                         {
1984                                 #
1985                                 # Increase/decrease nested tag count.
1986                                 #
1987                                 if ($tag{1} == '/')                                             $depth--;
1988                                 else if ($tag{strlen($tag)-2} != '/')   $depth++;
1990                                 if ($depth < 0) {
1991                                         #
1992                                         # Going out of parent element. Clean up and break so we
1993                                         # return to the calling function.
1994                                         #
1995                                         $text = $tag . $text;
1996                                         break;
1997                                 }
1998                                 
1999                                 $parsed .= $tag;
2000                         }
2001                         else {
2002                                 $parsed .= $tag;
2003                         }
2004                 } while ($depth >= 0);
2005                 
2006                 return array($parsed, $text);
2007         }
protected function _hashHTMLBlocks_inHTML($text, $hash_method, $md_attr) {
#
# Parse a run of HTML that starts with an opening tag, handing the content
# of any tag that carries a `markdown` attribute over to
# _hashHTMLBlocks_inMarkdown.
#
# *   $text: text to parse; the base tag name is read from its start.
# *   $hash_method: name of the method of this object used to hash each
#     finished chunk of HTML (callers pass "hashBlock" or "hashClean").
# *   $md_attr: whether the `markdown` attribute is honoured
#     (it is not inside clean tags).
#
# Parsing stops when the tag opened at the start of $text is closed.
#
# Returns an array of that form: ( processed text , remaining text ).
# If the text ends while the base tag is still open, the first character
# of the original text is returned as "processed" and everything after it
# as "remaining", so that the caller always makes progress.
#
    if ($text === '') return array('', '');

    # Regex to match the `markdown` attribute inside of a tag.
    $markdown_attr_re = '
        {
            \s*         # Eat whitespace before the `markdown` attribute
            markdown
            \s*=\s*
            (?>
                (["\'])     # $1: quote delimiter
                (.*?)       # $2: attribute value
                \1          # matching delimiter
            |
                ([^\s>]*)   # $3: unquoted attribute value
            )
            ()              # $4: make $3 always defined (avoid warnings)
        }xs';

    # Regex to match any tag. The whole tag is the pattern's only capture
    # group, so preg_split() below hands it back as $parts[1].
    $tag_re = '{
            (                   # $2: Capture whole tag.
                </?                 # Any opening or closing tag.
                    [\w:$]+         # Tag name.
                    (?:
                        (?=[\s"\'/a-zA-Z0-9])   # Allowed characters after tag name.
                        (?>
                            ".*?"       |   # Double quotes (can contain `>`)
                            \'.*?\'     |   # Single quotes (can contain `>`)
                            .+?             # Anything but quotes and `>`.
                        )*?
                    )?
                >                   # End of tag.
            |
                <!--    .*?     -->     # HTML Comment
            |
                <\?.*?\?> | <%.*?%>     # Processing instruction
            |
                <!\[CDATA\[.*?\]\]>     # CData Block
            )
        }xs';

    $original_text = $text;     # Saved so it can be handed back on failure.

    $depth      = 0;    # Current depth inside the tag tree.
    $block_text = "";   # HTML collected since the last hashed chunk.
    $parsed     = "";   # Parsed text that will be returned.

    #
    # Get the name of the starting tag.
    # (The pattern only lets through characters that are safe to embed in
    # a regex without quoting.) The name stays empty when $text starts
    # with something that is not a named tag, e.g. a comment; it is
    # initialised here so the loop never reads an undefined variable.
    #
    $base_tag_name_re = '';
    if (preg_match('/^<([\w:$]*)\b/', $text, $matches))
        $base_tag_name_re = $matches[1];

    #
    # Loop through every tag until we find the corresponding closing tag.
    #
    do {
        #
        # Split the text at the first tag found: text before the tag comes
        # first in the array, then the tag itself, then the text after it.
        #
        $parts = preg_split($tag_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);

        if (count($parts) < 3) {
            #
            # End of $text reached with unbalanced tag(s).
            # Return the original text unchanged, passing its first
            # character as filtered to prevent an infinite loop in the
            # parent function.
            #
            return array($original_text[0], substr($original_text, 1));
        }

        $block_text .= $parts[0]; # Text before current tag.
        $tag         = $parts[1]; # Tag to handle.
        $text        = $parts[2]; # Remaining text after current tag.

        #
        # Check for: Auto-close tag (like <hr/>)
        #            Comments and Processing Instructions.
        #
        if (preg_match('{^</?(?:'.$this->auto_close_tags_re.')\b}', $tag) ||
            $tag[1] == '!' || $tag[1] == '?')
        {
            # Just add the tag to the block as if it was text.
            $block_text .= $tag;
        }
        else {
            #
            # Increase/decrease nested tag count. Only do so if
            # the tag's name matches the base tag's.
            #
            if (preg_match('{^</?'.$base_tag_name_re.'\b}', $tag)) {
                if ($tag[1] == '/') {
                    $depth--;
                }
                else if ($tag[strlen($tag)-2] != '/') {
                    # Not a self-closed tag (no `/` right before `>`).
                    $depth++;
                }
            }

            #
            # Check for a `markdown` attribute whose value is exactly
            # `1`, `block` or `span`, and handle it. The alternation is
            # grouped so that both anchors apply to every alternative.
            #
            if ($md_attr &&
                preg_match($markdown_attr_re, $tag, $attr_m) &&
                preg_match('/^(?:1|block|span)$/', $attr_m[2] . $attr_m[3]))
            {
                # Remove `markdown` attribute from opening tag.
                $tag = preg_replace($markdown_attr_re, '', $tag);

                # Check if text inside this tag must be parsed in span mode.
                $this->mode = $attr_m[2] . $attr_m[3];
                $span_mode = $this->mode == 'span' ||
                    ($this->mode != 'block' &&
                        preg_match('{^<(?:'.$this->contain_span_tags_re.')\b}', $tag));

                # Calculate indent before tag: the number of leading spaces
                # on the last line collected so far.
                if (preg_match('/(?:^|\n)( *?)(?! ).*?$/', $block_text, $matches)) {
                    $strlen = $this->utf8_strlen;
                    $indent = $strlen($matches[1], 'UTF-8');
                } else {
                    $indent = 0;
                }

                # End preceding block with this tag.
                $block_text .= $tag;
                $parsed .= $this->$hash_method($block_text);

                # Get enclosing tag name for the Markdown parser.
                # (This pattern makes $tag_name_re safe without quoting.)
                preg_match('/^<([\w:$]*)\b/', $tag, $matches);
                $tag_name_re = $matches[1];

                # Parse the content using the HTML-in-Markdown parser.
                list ($block_text, $text)
                    = $this->_hashHTMLBlocks_inMarkdown($text, $indent,
                        $tag_name_re, $span_mode);

                # Outdent markdown text.
                if ($indent > 0) {
                    $block_text = preg_replace("/^[ ]{1,$indent}/m", "",
                                                $block_text);
                }

                # Append tag content to parsed text; block content is kept
                # apart from its surroundings by blank lines.
                if (!$span_mode)    $parsed .= "\n\n$block_text\n\n";
                else                $parsed .= "$block_text";

                # Start over with a new block.
                $block_text = "";
            }
            else $block_text .= $tag;
        }

    } while ($depth > 0);

    #
    # Hash last block text that wasn't processed inside the loop.
    #
    $parsed .= $this->$hash_method($block_text);

    return array($parsed, $text);
}
protected function hashClean($text) {
#
# Hash a "clean" tag: the text is handed to hashPart() with 'C' as its
# second argument and the result is returned unchanged.
# Used as the $hash_method when clean tags, comments and processing
# instructions are parsed.
#
    $hashed = $this->hashPart($text, 'C');
    return $hashed;
}
protected function doAnchors($text) {
#
# Turn Markdown link shortcuts into XHTML <a> tags.
#
# Three passes are made over $text, in this order: reference-style links,
# inline-style links, then reference-style shortcuts. The `in_anchor` flag
# is raised for the duration so that a nested call returns its input
# untouched.
#
    if ($this->in_anchor) return $text;
    $this->in_anchor = true;

    $reference_handler = array($this, '_doAnchors_reference_callback');
    $inline_handler    = array($this, '_doAnchors_inline_callback');

    #
    # Pass 1, reference-style links: [link text] [id]
    #
    $reference_re = '{
        (                   # wrap whole match in $1
          \[
            ('.$this->nested_brackets_re.') # link text = $2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
            (.*?)       # id = $3
          \]
        )
        }xs';
    $text = preg_replace_callback($reference_re, $reference_handler, $text);

    #
    # Pass 2, inline-style links: [link text](url "optional title")
    #
    $inline_re = '{
        (               # wrap whole match in $1
          \[
            ('.$this->nested_brackets_re.') # link text = $2
          \]
          \(            # literal paren
            [ \n]*
            (?:
                <(.+?)> # href = $3
            |
                ('.$this->nested_url_parenthesis_re.')  # href = $4
            )
            [ \n]*
            (           # $5
              ([\'"])   # quote char = $6
              (.*?)     # Title = $7
              \6        # matching quote
              [ \n]*    # ignore any spaces/tabs between closing quote and )
            )?          # title is optional
          \)
          (?:[ ]? '.$this->id_class_attr_catch_re.' )?   # $8 = id/class attributes
        )
        }xs';
    $text = preg_replace_callback($inline_re, $inline_handler, $text);

    #
    # Pass 3, reference-style shortcuts: [link text]
    # This has to run last, otherwise it would swallow the first half of
    # [link text][1] or [link text](/foo).
    #
    $shortcut_re = '{
        (                   # wrap whole match in $1
          \[
            ([^\[\]]+)      # link text = $2; can\'t contain [ or ]
          \]
        )
        }xs';
    $text = preg_replace_callback($shortcut_re, $reference_handler, $text);

    $this->in_anchor = false;
    return $text;
}
protected function _doAnchors_reference_callback($matches) {
#
# Callback for reference-style links and shortcuts.
#
# $matches[1] is the whole match, $matches[2] the link text and
# $matches[3] the reference id (absent for the [shortcut] form).
# Returns the hashed <a> tag when the id is a known reference, or the
# whole match untouched otherwise.
#
    $untouched = $matches[1];
    $label     = $matches[2];
    $ref       = isset($matches[3]) ? $matches[3] : null;

    # Shortcut links like [this][] or [this] use their own text as id.
    if ($ref == "") {
        $ref = $label;
    }

    # Ids are looked up lower-cased, with embedded newlines turned into
    # spaces.
    $ref = preg_replace('{[ ]?\n}', ' ', strtolower($ref));

    if (!isset($this->urls[$ref])) {
        return $untouched;
    }

    $href = $this->encodeAttribute($this->urls[$ref]);
    $html = "<a href=\"$href\"";

    if (isset($this->titles[$ref])) {
        $tooltip = $this->encodeAttribute($this->titles[$ref]);
        $html .=  " title=\"$tooltip\"";
    }
    if (isset($this->ref_attr[$ref])) {
        $html .= $this->ref_attr[$ref];
    }

    $label = $this->runSpanGamut($label);
    $html .= ">$label</a>";

    return $this->hashPart($html);
}
protected function _doAnchors_inline_callback($matches) {
#
# Callback for inline-style links: [link text](url "optional title")
#
# Capture groups used, as laid out by the pattern in doAnchors():
#   $2 link text, $3 href written as <url>, $4 bare href,
#   $6 title quote character, $7 title, $8 id/class attributes.
# Returns the hashed <a> tag.
#
    # The span gamut is run over the link text exactly once, as in
    # _doAnchors_reference_callback.
    $link_text   =  $this->runSpanGamut($matches[2]);
    $url         =  $matches[3] == '' ? $matches[4] : $matches[3];
    $title_quote =  isset($matches[6]) ? $matches[6] : '';
    $title       =  isset($matches[7]) ? $matches[7] : null;
    $attr  = $this->doExtraAttributes("a", $dummy =& $matches[8]);

    $url = $this->encodeAttribute($url);

    $result = "<a href=\"$url\"";

    # When the attribute group ($8) matches but no title was written,
    # $matches[7] is still set (to an empty string). Only a matched quote
    # character proves that a title, possibly empty, was really given.
    if (isset($title) && $title_quote != '') {
        $title = $this->encodeAttribute($title);
        $result .=  " title=\"$title\"";
    }
    $result .= $attr;

    $result .= ">$link_text</a>";

    return $this->hashPart($result);
}
protected function doImages($text) {
#
# Turn Markdown image shortcuts into <img> tags.
#
# Reference-style images are converted first, inline images second.
#
    #
    # Reference-style labeled images: ![alt text][id]
    #
    $reference_re = '{
        (               # wrap whole match in $1
          !\[
            ('.$this->nested_brackets_re.')     # alt text = $2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
            (.*?)       # id = $3
          \]

        )
        }xs';
    $text = preg_replace_callback($reference_re,
        array($this, '_doImages_reference_callback'), $text);

    #
    # Inline images:  ![alt text](url "optional title")
    # Don't forget: encode * and _
    #
    $inline_re = '{
        (               # wrap whole match in $1
          !\[
            ('.$this->nested_brackets_re.')     # alt text = $2
          \]
          \s?           # One optional whitespace character
          \(            # literal paren
            [ \n]*
            (?:
                <(\S*)> # src url = $3
            |
                ('.$this->nested_url_parenthesis_re.')  # src url = $4
            )
            [ \n]*
            (           # $5
              ([\'"])   # quote char = $6
              (.*?)     # title = $7
              \6        # matching quote
              [ \n]*
            )?          # title is optional
          \)
          (?:[ ]? '.$this->id_class_attr_catch_re.' )?   # $8 = id/class attributes
        )
        }xs';
    $text = preg_replace_callback($inline_re,
        array($this, '_doImages_inline_callback'), $text);

    return $text;
}
protected function _doImages_reference_callback($matches) {
#
# Callback for reference-style images: ![alt text][id]
#
# $matches[1] is the whole match, $matches[2] the alt text and
# $matches[3] the reference id. Returns the hashed <img> tag when the id
# is a known reference, or the whole match untouched otherwise.
#
    $untouched = $matches[1];
    $alt       = $matches[2];
    $ref       = strtolower($matches[3]);

    # Shortcut form ![this][] uses the alt text itself as id.
    if ($ref == "") {
        $ref = strtolower($alt);
    }

    $alt = $this->encodeAttribute($alt);

    # If there's no such link ID, leave intact.
    if (!isset($this->urls[$ref])) {
        return $untouched;
    }

    $src  = $this->encodeAttribute($this->urls[$ref]);
    $html = "<img src=\"$src\" alt=\"$alt\"";

    if (isset($this->titles[$ref])) {
        $tooltip = $this->encodeAttribute($this->titles[$ref]);
        $html .=  " title=\"$tooltip\"";
    }
    if (isset($this->ref_attr[$ref])) {
        $html .= $this->ref_attr[$ref];
    }
    $html .= $this->empty_element_suffix;

    return $this->hashPart($html);
}
protected function _doImages_inline_callback($matches) {
#
# Callback for inline images: ![alt text](url "optional title")
#
# Capture groups used, as laid out by the pattern in doImages():
#   $2 alt text, $3 src written as <url>, $4 bare src,
#   $6 title quote character, $7 title, $8 id/class attributes.
# Returns the hashed <img> tag.
#
    $alt_text    = $matches[2];
    $url         = $matches[3] == '' ? $matches[4] : $matches[3];
    $title_quote = isset($matches[6]) ? $matches[6] : '';
    $title       = isset($matches[7]) ? $matches[7] : null;
    $attr  = $this->doExtraAttributes("img", $dummy =& $matches[8]);

    $alt_text = $this->encodeAttribute($alt_text);
    $url = $this->encodeAttribute($url);
    $result = "<img src=\"$url\" alt=\"$alt_text\"";

    # When the attribute group ($8) matches but no title was written,
    # $matches[7] is still set (to an empty string). Only a matched quote
    # character proves that a title, possibly empty, was really given.
    if (isset($title) && $title_quote != '') {
        $title = $this->encodeAttribute($title);
        $result .=  " title=\"$title\"";
    }
    $result .= $attr;
    $result .= $this->empty_element_suffix;

    return $this->hashPart($result);
}
protected function doHeaders($text) {
#
# Convert Setext-style and atx-style headers, with optional id and class
# attributes written in braces after the header text.
#
    # Setext-style headers:
    #     Header 1  {#header1}
    #     ========
    #
    #     Header 2  {#header2 .class1 .class2}
    #     --------
    #
    # The callback reads the header text from $matches[1], the attributes
    # from $matches[2] and the underline from $matches[3].
    #
    $setext_re = '{
            (^.+?)                              # $1: Header text
            (?:[ ]+ '.$this->id_class_attr_catch_re.' )?     # $3 = id/class attributes
            [ ]*\n(=+|-+)[ ]*\n+                # $3: Header footer
        }mx';
    $text = preg_replace_callback($setext_re,
        array($this, '_doHeaders_callback_setext'), $text);

    # atx-style headers:
    #   # Header 1        {#header1}
    #   ## Header 2       {#header2}
    #   ## Header 2 with closing hashes ##  {#header3.class1.class2}
    #   ...
    #   ###### Header 6   {.class2}
    #
    $atx_re = '{
            ^(\#{1,6})  # $1 = string of #\'s
            [ ]*
            (.+?)       # $2 = Header text
            [ ]*
            \#*         # optional closing #\'s (not counted)
            (?:[ ]+ '.$this->id_class_attr_catch_re.' )?     # $3 = id/class attributes
            [ ]*
            \n+
        }xm';
    $text = preg_replace_callback($atx_re,
        array($this, '_doHeaders_callback_atx'), $text);

    return $text;
}
2471         protected function _doHeaders_callback_setext($matches) {
2472                 if ($matches[3] == '-' && preg_match('{^- }', $matches[1]))
2473                         return $matches[0];
2474                 $level = $matches[3]{0} == '=' ? 1 : 2;
2475                 $attr  = $this->doExtraAttributes("h$level", $dummy =& $matches[2]);
2476                 $block = "<h$level$attr>".$this->runSpanGamut($matches[1])."</h$level>";
2477                 return "\n" . $this->hashBlock($block) . "\n\n";
2478         }
2479         protected function _doHeaders_callback_atx($matches) {
2480                 $level = strlen($matches[1]);
2481                 $attr  = $this->doExtraAttributes("h$level", $dummy =& $matches[3]);
2482                 $block = "<h$level$attr>".$this->runSpanGamut($matches[2])."</h$level>";
2483                 return "\n" . $this->hashBlock($block) . "\n\n";
2484         }
2487         protected function doTables($text) {
2488         #
2489         # Form HTML tables.
2490         #
2491                 $less_than_tab = $this->tab_width - 1;
2492                 #
2493                 # Find tables with leading pipe.
2494                 #
2495                 #       | Header 1 | Header 2
2496                 #       | -------- | --------
2497                 #       | Cell 1   | Cell 2
2498                 #       | Cell 3   | Cell 4
2499                 #
2500                 $text = preg_replace_callback('
2501                         {
2502                                 ^                                                       # Start of a line
2503                                 [ ]{0,'.$less_than_tab.'}       # Allowed whitespace.
2504                                 [|]                                                     # Optional leading pipe (present)
2505                                 (.+) \n                                         # $1: Header row (at least one pipe)
2506                                 
2507                                 [ ]{0,'.$less_than_tab.'}       # Allowed whitespace.
2508                                 [|] ([ ]*[-:]+[-| :]*) \n       # $2: Header underline
2509                                 
2510                                 (                                                       # $3: Cells
2511                                         (?>
2512                                                 [ ]*                            # Allowed whitespace.
2513                                                 [|] .* \n                       # Row content.
2514                                         )*
2515                                 )
2516                                 (?=\n|\Z)                                       # Stop at final double newline.
2517                         }xm',
2518                         array(&$this, '_doTable_leadingPipe_callback'), $text);
2519                 
2520                 #
2521                 # Find tables without leading pipe.
2522                 #
2523                 #       Header 1 | Header 2
2524                 #       -------- | --------
2525                 #       Cell 1   | Cell 2
2526                 #       Cell 3   | Cell 4
2527                 #
2528                 $text = preg_replace_callback('
2529                         {
2530                                 ^                                                       # Start of a line
2531                                 [ ]{0,'.$less_than_tab.'}       # Allowed whitespace.
2532                                 (\S.*[|].*) \n                          # $1: Header row (at least one pipe)
2533                                 
2534                                 [ ]{0,'.$less_than_tab.'}       # Allowed whitespace.
2535                                 ([-:]+[ ]*[|][-| :]*) \n        # $2: Header underline
2536                                 
2537                                 (                                                       # $3: Cells
2538                                         (?>
2539                                                 .* [|] .* \n            # Row content
2540                                         )*
2541                                 )
2542                                 (?=\n|\Z)                                       # Stop at final double newline.
2543                         }xm',
2544                         array(&$this, '_DoTable_callback'), $text);
2546                 return $text;
2547         }
2548         protected function _doTable_leadingPipe_callback($matches) {
2549                 $head           = $matches[1];
2550                 $underline      = $matches[2];
2551                 $content        = $matches[3];
2552                 
2553                 # Remove leading pipe for each row.
2554                 $content        = preg_replace('/^ *[|]/m', '', $content);
2555                 
2556                 return $this->_doTable_callback(array($matches[0], $head, $underline, $content));
2557         }
2558         protected function _doTable_makeAlignAttr($alignname)
2559         {
2560                 if (empty($this->table_align_class_tmpl))
2561                         return " align=\"$alignname\"";
2563                 $classname = str_replace('%%', $alignname, $this->table_align_class_tmpl);
2564                 return " class=\"$classname\"";
2565         }
2566         protected function _doTable_callback($matches) {
2567                 $head           = $matches[1];
2568                 $underline      = $matches[2];
2569                 $content        = $matches[3];
2571                 # Remove any tailing pipes for each line.
2572                 $head           = preg_replace('/[|] *$/m', '', $head);
2573                 $underline      = preg_replace('/[|] *$/m', '', $underline);
2574                 $content        = preg_replace('/[|] *$/m', '', $content);
2575                 
2576                 # Reading alignement from header underline.
2577                 $separators     = preg_split('/ *[|] */', $underline);
2578                 foreach ($separators as $n => $s) {
2579                         if (preg_match('/^ *-+: *$/', $s))
2580                                 $attr[$n] = $this->_doTable_makeAlignAttr('right');
2581                         else if (preg_match('/^ *:-+: *$/', $s))
2582                                 $attr[$n] = $this->_doTable_makeAlignAttr('center');
2583                         else if (preg_match('/^ *:-+ *$/', $s))
2584                                 $attr[$n] = $this->_doTable_makeAlignAttr('left');
2585                         else
2586                                 $attr[$n] = '';
2587                 }
2588                 
2589                 # Parsing span elements, including code spans, character escapes, 
2590                 # and inline HTML tags, so that pipes inside those gets ignored.
2591                 $head           = $this->parseSpan($head);
2592                 $headers        = preg_split('/ *[|] */', $head);
2593                 $col_count      = count($headers);
2594                 $attr       = array_pad($attr, $col_count, '');
2595                 
2596                 # Write column headers.
2597                 $text = "<table>\n";
2598                 $text .= "<thead>\n";
2599                 $text .= "<tr>\n";
2600                 foreach ($headers as $n => $header)
2601                         $text .= "  <th$attr[$n]>".$this->runSpanGamut(trim($header))."</th>\n";
2602                 $text .= "</tr>\n";
2603                 $text .= "</thead>\n";
2604                 
2605                 # Split content by row.
2606                 $rows = explode("\n", trim($content, "\n"));
2607                 
2608                 $text .= "<tbody>\n";
2609                 foreach ($rows as $row) {
2610                         # Parsing span elements, including code spans, character escapes, 
2611                         # and inline HTML tags, so that pipes inside those gets ignored.
2612                         $row = $this->parseSpan($row);
2613                         
2614                         # Split row by cell.
2615                         $row_cells = preg_split('/ *[|] */', $row, $col_count);
2616                         $row_cells = array_pad($row_cells, $col_count, '');
2617                         
2618                         $text .= "<tr>\n";
2619                         foreach ($row_cells as $n => $cell)
2620                                 $text .= "  <td$attr[$n]>".$this->runSpanGamut(trim($cell))."</td>\n";
2621                         $text .= "</tr>\n";
2622                 }
2623                 $text .= "</tbody>\n";
2624                 $text .= "</table>";
2625                 
2626                 return $this->hashBlock($text) . "\n";
2627         }
2629         
2630         protected function doDefLists($text) {
2631         #
2632         # Form HTML definition lists.
2633         #
2634                 $less_than_tab = $this->tab_width - 1;
2636                 # Re-usable pattern to match any entire dl list:
2637                 $whole_list_re = '(?>
2638                         (                                                               # $1 = whole list
2639                           (                                                             # $2
2640                                 [ ]{0,'.$less_than_tab.'}
2641                                 ((?>.*\S.*\n)+)                         # $3 = defined term
2642                                 \n?
2643                                 [ ]{0,'.$less_than_tab.'}:[ ]+ # colon starting definition
2644                           )
2645                           (?s:.+?)
2646                           (                                                             # $4
2647                                   \z
2648                                 |
2649                                   \n{2,}
2650                                   (?=\S)
2651                                   (?!                                           # Negative lookahead for another term
2652                                         [ ]{0,'.$less_than_tab.'}
2653                                         (?: \S.*\n )+?                  # defined term
2654                                         \n?
2655                                         [ ]{0,'.$less_than_tab.'}:[ ]+ # colon starting definition
2656                                   )
2657                                   (?!                                           # Negative lookahead for another definition
2658                                         [ ]{0,'.$less_than_tab.'}:[ ]+ # colon starting definition
2659                                   )