on-demand release 4.0dev+
[moodle.git] / filter / urltolink / filter.php
CommitLineData
062b4110
DM
1<?php
2
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.

/**
 * Filter converting URLs in the text to HTML links
 *
 * @package filter
 * @subpackage urltolink
 * @copyright 2010 David Mudrak <david@moodle.com>
 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
26
27defined('MOODLE_INTERNAL') || die();
28
class filter_urltolink extends moodle_text_filter {

    /**
     * @var array global configuration for this filter
     *
     * This might be eventually moved into parent class if we found it
     * useful for other filters, too.
     */
    protected static $globalconfig;

    /**
     * Apply the filter to the text
     *
     * URLs are only converted when the caller states the original format of the
     * text and that format is listed in the filter_urltolink "formats" setting.
     *
     * @see filter_manager::apply_filter_chain()
     * @param string $text text to be processed by the filter
     * @param array $options filter options; only 'originalformat' is read here
     * @return string text after processing
     */
    public function filter($text, array $options = array()) {
        if (!isset($options['originalformat'])) {
            // If the format is not specified, we are probably called by {@see format_string()}.
            // In that case it would be dangerous to replace the URL with a link because the
            // link could be stripped again. Therefore we do nothing.
            return $text;
        }

        // The comparison is intentionally loose: the configured formats are strings produced by
        // explode(). NOTE(review): 'originalformat' is presumably an integer FORMAT_* constant -
        // confirm before switching in_array() to strict mode.
        if (in_array($options['originalformat'], explode(',', get_config('filter_urltolink', 'formats')))) {
            $this->convert_urls_into_links($text);
        }
        return $text;
    }

    ////////////////////////////////////////////////////////////////////////////
    // internal implementation starts here
    ////////////////////////////////////////////////////////////////////////////

    /**
     * Given some text this function converts any URLs it finds into HTML links
     *
     * Existing anchors and spans marked with class="nolink" are set aside first and put back
     * untouched at the end. HTML tags are never modified, only the text between them. When the
     * "embedimages" setting is on, links created here that point to jpg/png/gif files are
     * replaced by img tags.
     *
     * @param string $text Passed in by reference. The string to be searched for urls.
     */
    protected function convert_urls_into_links(&$text) {
        // Tags whose whole content must be left alone. See MDL-21168 for background.
        // A better way to ignore tags whether or not they are escaped partially or
        // completely would be desirable. For example:
        // <a href="blah">
        // &lt;a href="blah"&gt;
        // &lt;a href="blah">
        $filterignoretagsopen = array('<a\s[^>]+?>', '<span[^>]+?class="nolink"[^>]*?>');
        $filterignoretagsclose = array('</a>', '</span>');
        $ignoretags = [];
        filter_save_ignore_tags($text, $filterignoretagsopen, $filterignoretagsclose, $ignoretags);

        // Check if we support unicode modifiers in regular expressions. Cache it.
        // TODO: this check should be a environment requirement in Moodle 2.0, as far as unicode
        // chars are going to arrive to URLs officially really soon (2010?)
        // Original RFC regex from: http://www.bytemycode.com/snippets/snippet/796/
        // Various ideas from: http://alanstorm.com/url_regex_explained
        // Unicode check, negative assertion and other bits from Moodle.
        static $unicoderegexp;
        if (!isset($unicoderegexp)) {
            // The @ is deliberate: without unicode support in PCRE this fails silently and
            // returns false, which selects the ASCII-only fallback below.
            $unicoderegexp = @preg_match('/\pL/u', 'a');
        }

        // TODO MDL-21296 - use of unicode modifiers may cause a timeout
        // Capture groups: $1 = "s" of https, $2 = "www." for scheme-less URLs.
        $urlstart = '(?:http(s)?://|(?<!://)(www\.))';
        $domainsegment = '(?:[\pLl0-9][\pLl0-9-]*[\pLl0-9]|[\pLl0-9])';
        $numericip = '(?:(?:[0-9]{1,3}\.){3}[0-9]{1,3})';
        $port = '(?::\d*)';
        $pathchar = '(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-f0-9]{2})';
        $path = "(?:/$pathchar*)*";
        $querystring = '(?:\?(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)';
        $fragment = '(?:\#(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)';

        // Lookbehind assertion: the match must not end with one of ] ) , . ; so that
        // punctuation following a URL in a sentence does not become part of the link.
        $lookbehindend = "(?<![]),.;])";

        // Capture groups continued: $3 = host name or numeric IP, $4 = port, path, query and fragment.
        $regex = "$urlstart((?:$domainsegment\.)+$domainsegment|$numericip)" .
                "($port?$path$querystring?$fragment?)$lookbehindend";
        if ($unicoderegexp) {
            $regex = '#' . $regex . '#ui';
        } else {
            // Plain string substitution: these are literal tokens inside the pattern, not
            // regular expressions themselves. '\pLl' must be replaced before its prefix '\pL'.
            $regex = '#' . str_replace(array('\pLl', '\pL'), 'a-z', $regex) . '#i';
        }

        // Locate any HTML tags.
        $chunks = preg_split('/(<[^<|>]*>)/i', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // If the split itself failed, leave the text as it is rather than destroying it.
        if ($chunks !== false) {
            // Iterate through the tokenized text to handle chunks (html and content).
            foreach ($chunks as $idx => $chunk) {
                // Nothing to do. We skip completely any html chunk.
                if (strpos(trim($chunk), '<') === 0) {
                    continue;
                }

                // Nothing to do. We skip any content chunk having any of these attributes.
                if (preg_match('#(background=")|(action=")|(style="background)|(href=")|(src=")|(url [(])#', $chunk)) {
                    continue;
                }

                // Arrived here, we want to process every word in this chunk.
                $words = explode(' ', $chunk);
                foreach ($words as $idx2 => $word) {
                    // ReDoS protection. Do not process a word that is too large.
                    if (strlen($word) >= 4096) {
                        continue;
                    }
                    $linked = preg_replace($regex, '<a href="http$1://$2$3$4" class="_blanktarget">$0</a>', $word);
                    // preg_replace() returns null on a PCRE error; keep the original word then.
                    if ($linked !== null) {
                        $words[$idx2] = $linked;
                    }
                }

                // Copy the result back to the array.
                $chunks[$idx] = implode(' ', $words);
            }

            $text = implode('', $chunks);
        }

        if (!empty($ignoretags)) {
            // Reversed so "progressive" str_replace() will solve some nesting problems.
            $ignoretags = array_reverse($ignoretags);
            $text = str_replace(array_keys($ignoretags), $ignoretags, $text);
        }

        if (get_config('filter_urltolink', 'embedimages')) {
            // Now try to inject the images, this code was originally in the mediaplugin filter.
            // This may be useful only if somebody relies on the fact the links in FORMAT_MOODLE
            // get converted to URLs which in turn change to real images.
            $search = '/<a href="([^"]+\.(jpg|png|gif))" class="_blanktarget">([^>]*)<\/a>/is';
            $withimages = preg_replace_callback($search, 'filter_urltolink_img_callback', $text);
            // A null result signals a PCRE error; keep the linked text in that case.
            if ($withimages !== null) {
                $text = $withimages;
            }
        }
    }
}
fcd2cbaf
PS
161
162
/**
 * Replace a link to an image with the embedded image itself.
 *
 * This plugin is intended for automatic conversion of image URLs when FORMAT_MOODLE used.
 * Only anchors whose visible text is exactly the same as their href are converted; any
 * other anchor is returned unchanged.
 *
 * @param array $link regular expression match: [0] the whole anchor, [1] the href, [3] the link text
 * @return string the img tag, or the original anchor when the text differs from the URL
 */
function filter_urltolink_img_callback($link) {
    $href = $link[1];
    $label = $link[3];

    if ($href === $label) {
        return '<img class="filter_urltolink_image" alt="" src="' . $href . '" />';
    }

    // The url does not match the text, so this is not a link created by this filter.
    return $link[0];
}