MDL-47002 editor_atto: Remove all after html when pasting
[moodle.git] / lib / editor / atto / yui / src / editor / js / clean.js
CommitLineData
62467795
AN
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.

/**
 * @module moodle-editor_atto-editor
 * @submodule clean
 */
20
/**
 * Functions for the Atto editor to clean the generated content.
 *
 * See {{#crossLink "M.editor_atto.Editor"}}{{/crossLink}} for details.
 *
 * @namespace M.editor_atto
 * @class EditorClean
 */
d088a835 29
62467795
AN
function EditorClean() {}

EditorClean.ATTRS = {
};

EditorClean.prototype = {
    /**
     * Clean the generated HTML content without modifying the editor content.
     *
     * Works on a clone of the editor node: YUI-generated ids and Atto control nodes
     * are removed from the clone before its HTML is read and cleaned.
     *
     * @method getCleanHTML
     * @return {String} The cleaned HTML content, or an empty string for an untouched editor.
     */
    getCleanHTML: function() {
        // Clone the editor so that we don't actually modify the real content.
        var editorClone = this.editor.cloneNode(true),
            html;

        // Remove all YUI IDs.
        Y.each(editorClone.all('[id^="yui"]'), function(node) {
            node.removeAttribute('id');
        });

        editorClone.all('.atto_control').remove(true);
        html = editorClone.get('innerHTML');

        // Revert untouched editor contents to an empty string.
        if (html === '<p></p>' || html === '<p><br></p>') {
            return '';
        }

        // Remove any and all nasties from source.
        return this._cleanHTML(html);
    },

    /**
     * Clean the HTML content of the editor in place.
     *
     * @method cleanEditorHTML
     * @chainable
     */
    cleanEditorHTML: function() {
        var startValue = this.editor.get('innerHTML');
        this.editor.set('innerHTML', this._cleanHTML(startValue));

        return this;
    },

    /**
     * Clean the specified HTML content and remove any content which could cause issues.
     *
     * Only a limited set of things is removed: those that can break the page or are
     * disallowed, like unclosed comments, style blocks and document-level tags.
     *
     * @method _cleanHTML
     * @private
     * @param {String} content The content to clean
     * @return {String} The cleaned HTML
     */
    _cleanHTML: function(content) {
        var rules = [
            // Remove any style blocks. Some browsers do not work well with them in a contenteditable.
            // Plus style blocks are not allowed in body html, except with "scoped", which most browsers don't support as of 2015.
            // Reference: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
            {regex: /<style[^>]*>[\s\S]*?<\/style>/gi, replace: ""},

            // Remove any HTML comment opens that are not followed by a close. This can completely break page layout.
            {regex: /<!--(?![\s\S]*?-->)/gi, replace: ""},

            // Source: "http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html"
            // Remove forbidden tags for content, title, meta, style, st0-9, head, font, html, body, link.
            // The (?![\w\-]) lookahead requires the tag name to end right after the forbidden word, so that
            // legitimate tags which merely start with one (e.g. <header>) are left alone.
            {regex: /<\/?(?:title|meta|style|st\d|head|font|html|body|link)(?![\w\-])[^>]*?>/gi, replace: ""}
        ];

        return this._filterContentWithRules(content, rules);
    },

    /**
     * Take the supplied content and run it through the supplied regex rules, in order.
     *
     * @method _filterContentWithRules
     * @private
     * @param {String} content The content to clean
     * @param {Array} rules An array of structures: [ {regex: /something/, replace: "something"}, {...}, ...]
     * @return {String} The cleaned content
     */
    _filterContentWithRules: function(content, rules) {
        var i;
        for (i = 0; i < rules.length; i++) {
            content = content.replace(rules[i].regex, rules[i].replace);
        }

        return content;
    },

    /**
     * Intercept and clean html paste events.
     *
     * When the clipboard HTML can be read, the native paste is cancelled and the cleaned
     * HTML is inserted instead. Otherwise the native paste is allowed to continue and a
     * delayed cleanup or update is scheduled.
     *
     * @method pasteCleanup
     * @param {Object} sourceEvent The YUI EventFacade object
     * @return {Boolean} True if the passed event should continue, false if not.
     */
    pasteCleanup: function(sourceEvent) {
        var event,
            types,
            isHTML = false,
            content,
            selection;

        // We only expect paste events, but we will check anyway.
        // For anything else just call updateOriginalDelayed() - it's safe.
        if (sourceEvent.type !== 'paste') {
            this.updateOriginalDelayed();
            return true;
        }

        // The YUI event wrapper doesn't provide paste event info, so we need the underlying event.
        event = sourceEvent._event;

        // Check if we have a valid clipboardData object in the event.
        // IE has a clipboard object at window.clipboardData, but as of IE 11, it does not provide HTML content access.
        if (!event || !event.clipboardData || !event.clipboardData.getData) {
            // The browser probably has limited (or no) clipboard support.
            // Wait for the clipboard event to finish then fallback.
            this.fallbackPasteCleanupDelayed();
            return true;
        }

        // Check if there is HTML type to be pasted, this is all we care about.
        // Different browsers use different things to hold the types, so test various functions.
        types = event.clipboardData.types;
        if (!types) {
            isHTML = false;
        } else if (typeof types.contains === 'function') {
            isHTML = types.contains('text/html');
        } else if (typeof types.indexOf === 'function') {
            isHTML = (types.indexOf('text/html') > -1);
            if (!isHTML &&
                    ((types.indexOf('com.apple.webarchive') > -1) ||
                    (types.indexOf('com.apple.iWork.TSPNativeData') > -1))) {
                // This is going to be a specialized Apple paste. We cannot capture this, so clean everything.
                this.fallbackPasteCleanupDelayed();
                return true;
            }
        } else {
            // We don't know how to handle the clipboard info, so wait for the clipboard event to finish then fallback.
            this.fallbackPasteCleanupDelayed();
            return true;
        }

        if (!isHTML) {
            // This is a non-html paste event, we can just let this continue on and call updateOriginalDelayed.
            this.updateOriginalDelayed();
            return true;
        }

        // Get the clipboard content.
        try {
            content = event.clipboardData.getData('text/html');
        } catch (error) {
            // Something went wrong reading the clipboard. Let the paste happen and clean up afterwards.
            this.fallbackPasteCleanupDelayed();
            return true;
        }

        // Stop the original paste.
        sourceEvent.preventDefault();

        // Scrub the paste content.
        content = this._cleanPasteHTML(content);

        // Save the current selection.
        // Using saveSelection as it produces a more consistent experience.
        selection = window.rangy.saveSelection();

        // Insert the content.
        this.insertContentAtFocusPoint(content);

        // Restore the selection, and collapse to end.
        window.rangy.restoreSelection(selection);
        window.rangy.getSelection().collapseToEnd();

        // Update the text area.
        this.updateOriginal();
        return false;
    },

    /**
     * Cleanup code after a paste event if we couldn't intercept the paste content.
     *
     * The whole editor content is read, cleaned with the paste rules and written back,
     * preserving the selection (cursor position) around the replacement.
     *
     * @method fallbackPasteCleanup
     * @chainable
     */
    fallbackPasteCleanup: function() {
        Y.log('Using fallbackPasteCleanup for atto cleanup', 'debug', LOGNAME);

        // Save the current selection (cursor position).
        var selection = window.rangy.saveSelection();

        // Get, clean, and replace the content in the editable.
        var content = this.editor.get('innerHTML');
        this.editor.set('innerHTML', this._cleanPasteHTML(content));

        // Update the textarea.
        this.updateOriginal();

        // Restore the selection (cursor position).
        window.rangy.restoreSelection(selection);

        return this;
    },

    /**
     * Calls fallbackPasteCleanup on a short timer to allow the paste event handlers to complete.
     *
     * @method fallbackPasteCleanupDelayed
     * @chainable
     */
    fallbackPasteCleanupDelayed: function() {
        Y.soon(Y.bind(this.fallbackPasteCleanup, this));

        return this;
    },

    /**
     * Cleanup html that comes from WYSIWYG paste events. These are more likely to contain messy code that we should strip.
     *
     * Cleaning runs in two passes: rules that do not depend on how the markup is quoted, then
     * (after the browser has normalised the markup) rules that rely on quoting and whitespace.
     * The standard _cleanHTML rules are applied after each pass.
     *
     * @method _cleanPasteHTML
     * @private
     * @param {String} content The html content to clean
     * @return {String} The cleaned HTML
     */
    _cleanPasteHTML: function(content) {
        var rules,
            holder;

        // Return an empty string if passed an invalid or empty object.
        if (!content || content.length === 0) {
            return "";
        }

        // Rules that get rid of the real-nasties and don't care about normalize code (correct quotes, white spaces, etc).
        rules = [
            // Stuff that is specifically from MS Word and similar office packages.
            // Remove all garbage after closing html tag.
            {regex: /<\s*\/html\s*>([\s\S]+)$/gi, replace: ""},
            // Remove if comment blocks.
            {regex: /<!--\[if[\s\S]*?endif\]-->/gi, replace: ""},
            // Remove start and end fragment comment blocks.
            {regex: /<!--(Start|End)Fragment-->/gi, replace: ""},
            // Remove any xml blocks.
            {regex: /<xml[^>]*>[\s\S]*?<\/xml>/gi, replace: ""},
            // Remove any <?xml><\?xml> blocks.
            // NOTE(review): the closing part matches a literal "<\?xml>" (with a backslash) - confirm this is intended.
            {regex: /<\?xml[^>]*>[\s\S]*?<\\\?xml>/gi, replace: ""},
            // Remove <o:blah>, <\o:blah>.
            {regex: /<\/?\w+:[^>]*>/gi, replace: ""}
        ];

        // Apply the first set of harsher rules.
        content = this._filterContentWithRules(content, rules);

        // Apply the standard rules, which mainly cleans things like headers, links, and style blocks.
        content = this._cleanHTML(content);

        // Check if the string is empty or only contains whitespace.
        if (content.length === 0 || !content.match(/\S/)) {
            return content;
        }

        // Now we let the browser normalize the code by loading it into the DOM and then get the html back.
        // This gives us well quoted, well formatted code to continue our work on. Word may provide very poorly formatted code.
        holder = document.createElement('div');
        holder.innerHTML = content;
        content = holder.innerHTML;
        // Free up the DOM memory.
        holder.innerHTML = "";

        // Run some more rules that care about quotes and whitespace.
        rules = [
            // Remove MSO-blah, MSO:blah in style attributes. Only removes one or more that appear in succession.
            {regex: /(<[^>]*?style\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[-:][^>;"]*;?)+/gi, replace: "$1"},
            // Remove MSO classes in class attributes. Only removes one or more that appear in succession.
            {regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
            // Remove Apple- classes in class attributes. Only removes one or more that appear in succession.
            {regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*Apple-[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
            // Remove OLE_LINK# anchors that may litter the code.
            {regex: /<a [^>]*?name\s*?=\s*?"OLE_LINK\d*?"[^>]*?>\s*?<\/a>/gi, replace: ""},
            // Remove empty spans, but not ones from Rangy.
            {regex: /<span(?![^>]*?rangySelectionBoundary[^>]*?)[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""}
        ];

        // Apply the rules.
        content = this._filterContentWithRules(content, rules);

        // Reapply the standard cleaner to the content.
        content = this._cleanHTML(content);

        return content;
    }
};
d088a835 322
// Mix the EditorClean prototype functions into the Atto editor class.
Y.Base.mix(Y.M.editor_atto.Editor, [EditorClean]);