mirror of
https://github.com/moodle/moodle.git
synced 2025-04-21 16:32:18 +02:00
Merge branch 'MDL-47003-master' of git://github.com/merrill-oakland/moodle
This commit is contained in:
commit
b6f7863e77
@ -1109,49 +1109,40 @@ EditorClean.prototype = {
|
||||
* @return {String} The cleaned HTML
|
||||
*/
|
||||
_cleanHTML: function(content) {
|
||||
// What are we doing ?
|
||||
// We are cleaning random HTML from all over the shop into a set of useful html suitable for content.
|
||||
// We are allowing styles etc, but not e.g. font tags, class="MsoNormal" etc.
|
||||
// Removing limited things that can break the page or are disallowed, like unclosed comments, style blocks, etc.
|
||||
|
||||
var rules = [
|
||||
// Remove any style blocks. Some browsers do not work well with them in a contenteditable.
|
||||
// Plus style blocks are not allowed in body html, except with "scoped", which most browsers don't support as of 2015.
|
||||
// Reference: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
|
||||
{regex: /<style[^>]*>[\s\S]*?<\/style>/gi, replace: ""},
|
||||
|
||||
// Source: "http://stackoverflow.com/questions/2875027/clean-microsoft-word-pasted-text-using-javascript"
|
||||
// Source: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
|
||||
|
||||
// Remove all HTML comments.
|
||||
{regex: /<!--[\s\S]*?-->/gi, replace: ""},
|
||||
// Source: "http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php"
|
||||
// Remove <?xml>, <\?xml>.
|
||||
{regex: /<\\?\?xml[^>]*>/gi, replace: ""},
|
||||
// Remove <o:blah>, <\o:blah>.
|
||||
{regex: /<\/?\w+:[^>]*>/gi, replace: ""}, // e.g. <o:p...
|
||||
// Remove MSO-blah, MSO:blah (e.g. in style attributes)
|
||||
{regex: /\s*MSO[-:][^;"']*;?/gi, replace: ""},
|
||||
// Remove empty spans
|
||||
{regex: /<span[^>]*>( |\s)*<\/span>/gi, replace: ""},
|
||||
// Remove class="Msoblah"
|
||||
{regex: /class="Mso[^"]*"/gi, replace: ""},
|
||||
|
||||
// Remove any open HTML comment opens that are not followed by a close. This can completely break page layout.
|
||||
{regex: /<!--(?![\s\S]*?-->)/gi, replace: ""},
|
||||
|
||||
// Source: "http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html"
|
||||
// Remove forbidden tags for content, title, meta, style, st0-9, head, font, html, body, link.
|
||||
{regex: /<(\/?title|\/?meta|\/?style|\/?st\d|\/?head|\/?font|\/?html|\/?body|\/?link|!\[)[^>]*?>/gi, replace: ""},
|
||||
|
||||
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
|
||||
// Replace extended chars with simple text.
|
||||
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
|
||||
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
|
||||
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
|
||||
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
|
||||
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
|
||||
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
|
||||
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
|
||||
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
|
||||
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
|
||||
{regex: /<\/?(?:title|meta|style|st\d|head|font|html|body|link|!\[)[^>]*?>/gi, replace: ""}
|
||||
];
|
||||
|
||||
return this._filterContentWithRules(content, rules);
|
||||
},
|
||||
|
||||
/**
|
||||
* Take the supplied content and run the supplied regex rules on it.
|
||||
*
|
||||
* @method _filterContentWithRules
|
||||
* @private
|
||||
* @param {String} content The content to clean
|
||||
* @param {Array} rules An array of structures: [ {regex: /something/, replace: "something"}, {...}, ...]
|
||||
* @return {String} The cleaned content
|
||||
*/
|
||||
_filterContentWithRules: function(content, rules) {
|
||||
var i = 0;
|
||||
for (i = 0; i < rules.length; i++) {
|
||||
content = content.replace(rules[i].regex, rules[i].replace);
|
||||
@ -1213,7 +1204,7 @@ EditorClean.prototype = {
|
||||
sourceEvent.preventDefault();
|
||||
|
||||
// Scrub the paste content.
|
||||
content = this._cleanHTML(content);
|
||||
content = this._cleanPasteHTML(content);
|
||||
|
||||
// Save the current selection.
|
||||
// Using saveSelection as it produces a more consistent experience.
|
||||
@ -1262,7 +1253,7 @@ EditorClean.prototype = {
|
||||
|
||||
// Get, clean, and replace the content in the editable.
|
||||
var content = this.editor.get('innerHTML');
|
||||
this.editor.set('innerHTML', this._cleanHTML(content));
|
||||
this.editor.set('innerHTML', this._cleanPasteHTML(content));
|
||||
|
||||
// Update the textarea.
|
||||
this.updateOriginal();
|
||||
@ -1283,6 +1274,86 @@ EditorClean.prototype = {
|
||||
Y.soon(Y.bind(this.fallbackPasteCleanup, this));
|
||||
|
||||
return this;
|
||||
},
|
||||
|
||||
/**
|
||||
* Cleanup html that comes from WYSIWYG paste events. These are more likely to contain messy code that we should strip.
|
||||
*
|
||||
* @method _cleanPasteHTML
|
||||
* @private
|
||||
* @param {String} content The html content to clean
|
||||
* @return {String} The cleaned HTML
|
||||
*/
|
||||
_cleanPasteHTML: function(content) {
|
||||
// Return an empty string if passed an invalid or empty object.
|
||||
if (!content || content.length === 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Rules that get rid of the real-nasties and don't care about normalize code (correct quotes, white spaces, etc).
|
||||
var rules = [
|
||||
// Remove any xml blocks.
|
||||
{regex: /<xml[^>]*>[\s\S]*?<\/xml>/gi, replace: ""},
|
||||
// Remove any <?xml><\?xml> blocks.
|
||||
{regex: /<\?xml[^>]*>[\s\S]*?<\\\?xml>/gi, replace: ""},
|
||||
// Remove <o:blah>, <\o:blah>.
|
||||
{regex: /<\/?\w+:[^>]*>/gi, replace: ""},
|
||||
|
||||
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
|
||||
// Replace extended chars with simple text.
|
||||
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
|
||||
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
|
||||
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
|
||||
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
|
||||
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
|
||||
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
|
||||
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
|
||||
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
|
||||
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
|
||||
];
|
||||
|
||||
// Apply the first set of harsher rules.
|
||||
content = this._filterContentWithRules(content, rules);
|
||||
|
||||
// Apply the standard rules, which mainly cleans things like headers, links, and style blocks.
|
||||
content = this._cleanHTML(content);
|
||||
|
||||
// Check if the string is empty or only contains whitespace.
|
||||
if (content.length === 0 || !content.match(/\S/)) {
|
||||
return content;
|
||||
}
|
||||
|
||||
// Now we let the browser normalize the code by loading it into the DOM and then get the html back.
|
||||
// This gives us well quoted, well formatted code to continue our work on. Word may provide very poorly formatted code.
|
||||
var holder = document.createElement('div');
|
||||
holder.innerHTML = content;
|
||||
content = holder.innerHTML;
|
||||
// Free up the DOM memory.
|
||||
holder.innerHTML = "";
|
||||
|
||||
// Run some more rules that care about quotes and whitespace.
|
||||
rules = [
|
||||
// Remove MSO-blah, MSO:blah in style attributes. Only removes one or more that appear in succession.
|
||||
{regex: /(<[^>]*?style\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[-:][^>;"]*;?)+/gi, replace: "$1"},
|
||||
// Remove MSO classes in class attributes. Only removes one or more that appear in succession.
|
||||
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
|
||||
// Remove Apple- classes in class attributes. Only removes one or more that appear in succession.
|
||||
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*Apple-[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
|
||||
// Remove OLE_LINK# anchors that may litter the code.
|
||||
{regex: /<a [^>]*?name\s*?=\s*?"OLE_LINK\d*?"[^>]*?>\s*?<\/a>/gi, replace: ""},
|
||||
// Remove empty spans.
|
||||
{regex: /<span[^>]*>( |\s)*<\/span>/gi, replace: ""}
|
||||
];
|
||||
|
||||
// Apply the rules.
|
||||
content = this._filterContentWithRules(content, rules);
|
||||
|
||||
// Reapply the standard cleaner to the content.
|
||||
content = this._cleanHTML(content);
|
||||
|
||||
return content;
|
||||
}
|
||||
};
|
||||
|
||||
|
File diff suppressed because one or more lines are too long
@ -1099,49 +1099,40 @@ EditorClean.prototype = {
|
||||
* @return {String} The cleaned HTML
|
||||
*/
|
||||
_cleanHTML: function(content) {
|
||||
// What are we doing ?
|
||||
// We are cleaning random HTML from all over the shop into a set of useful html suitable for content.
|
||||
// We are allowing styles etc, but not e.g. font tags, class="MsoNormal" etc.
|
||||
// Removing limited things that can break the page or are disallowed, like unclosed comments, style blocks, etc.
|
||||
|
||||
var rules = [
|
||||
// Remove any style blocks. Some browsers do not work well with them in a contenteditable.
|
||||
// Plus style blocks are not allowed in body html, except with "scoped", which most browsers don't support as of 2015.
|
||||
// Reference: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
|
||||
{regex: /<style[^>]*>[\s\S]*?<\/style>/gi, replace: ""},
|
||||
|
||||
// Source: "http://stackoverflow.com/questions/2875027/clean-microsoft-word-pasted-text-using-javascript"
|
||||
// Source: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
|
||||
|
||||
// Remove all HTML comments.
|
||||
{regex: /<!--[\s\S]*?-->/gi, replace: ""},
|
||||
// Source: "http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php"
|
||||
// Remove <?xml>, <\?xml>.
|
||||
{regex: /<\\?\?xml[^>]*>/gi, replace: ""},
|
||||
// Remove <o:blah>, <\o:blah>.
|
||||
{regex: /<\/?\w+:[^>]*>/gi, replace: ""}, // e.g. <o:p...
|
||||
// Remove MSO-blah, MSO:blah (e.g. in style attributes)
|
||||
{regex: /\s*MSO[-:][^;"']*;?/gi, replace: ""},
|
||||
// Remove empty spans
|
||||
{regex: /<span[^>]*>( |\s)*<\/span>/gi, replace: ""},
|
||||
// Remove class="Msoblah"
|
||||
{regex: /class="Mso[^"]*"/gi, replace: ""},
|
||||
|
||||
// Remove any open HTML comment opens that are not followed by a close. This can completely break page layout.
|
||||
{regex: /<!--(?![\s\S]*?-->)/gi, replace: ""},
|
||||
|
||||
// Source: "http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html"
|
||||
// Remove forbidden tags for content, title, meta, style, st0-9, head, font, html, body, link.
|
||||
{regex: /<(\/?title|\/?meta|\/?style|\/?st\d|\/?head|\/?font|\/?html|\/?body|\/?link|!\[)[^>]*?>/gi, replace: ""},
|
||||
|
||||
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
|
||||
// Replace extended chars with simple text.
|
||||
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
|
||||
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
|
||||
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
|
||||
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
|
||||
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
|
||||
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
|
||||
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
|
||||
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
|
||||
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
|
||||
{regex: /<\/?(?:title|meta|style|st\d|head|font|html|body|link|!\[)[^>]*?>/gi, replace: ""}
|
||||
];
|
||||
|
||||
return this._filterContentWithRules(content, rules);
|
||||
},
|
||||
|
||||
/**
|
||||
* Take the supplied content and run the supplied regex rules on it.
|
||||
*
|
||||
* @method _filterContentWithRules
|
||||
* @private
|
||||
* @param {String} content The content to clean
|
||||
* @param {Array} rules An array of structures: [ {regex: /something/, replace: "something"}, {...}, ...]
|
||||
* @return {String} The cleaned content
|
||||
*/
|
||||
_filterContentWithRules: function(content, rules) {
|
||||
var i = 0;
|
||||
for (i = 0; i < rules.length; i++) {
|
||||
content = content.replace(rules[i].regex, rules[i].replace);
|
||||
@ -1203,7 +1194,7 @@ EditorClean.prototype = {
|
||||
sourceEvent.preventDefault();
|
||||
|
||||
// Scrub the paste content.
|
||||
content = this._cleanHTML(content);
|
||||
content = this._cleanPasteHTML(content);
|
||||
|
||||
// Save the current selection.
|
||||
// Using saveSelection as it produces a more consistent experience.
|
||||
@ -1251,7 +1242,7 @@ EditorClean.prototype = {
|
||||
|
||||
// Get, clean, and replace the content in the editable.
|
||||
var content = this.editor.get('innerHTML');
|
||||
this.editor.set('innerHTML', this._cleanHTML(content));
|
||||
this.editor.set('innerHTML', this._cleanPasteHTML(content));
|
||||
|
||||
// Update the textarea.
|
||||
this.updateOriginal();
|
||||
@ -1272,6 +1263,86 @@ EditorClean.prototype = {
|
||||
Y.soon(Y.bind(this.fallbackPasteCleanup, this));
|
||||
|
||||
return this;
|
||||
},
|
||||
|
||||
/**
|
||||
* Cleanup html that comes from WYSIWYG paste events. These are more likely to contain messy code that we should strip.
|
||||
*
|
||||
* @method _cleanPasteHTML
|
||||
* @private
|
||||
* @param {String} content The html content to clean
|
||||
* @return {String} The cleaned HTML
|
||||
*/
|
||||
_cleanPasteHTML: function(content) {
|
||||
// Return an empty string if passed an invalid or empty object.
|
||||
if (!content || content.length === 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Rules that get rid of the real-nasties and don't care about normalize code (correct quotes, white spaces, etc).
|
||||
var rules = [
|
||||
// Remove any xml blocks.
|
||||
{regex: /<xml[^>]*>[\s\S]*?<\/xml>/gi, replace: ""},
|
||||
// Remove any <?xml><\?xml> blocks.
|
||||
{regex: /<\?xml[^>]*>[\s\S]*?<\\\?xml>/gi, replace: ""},
|
||||
// Remove <o:blah>, <\o:blah>.
|
||||
{regex: /<\/?\w+:[^>]*>/gi, replace: ""},
|
||||
|
||||
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
|
||||
// Replace extended chars with simple text.
|
||||
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
|
||||
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
|
||||
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
|
||||
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
|
||||
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
|
||||
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
|
||||
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
|
||||
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
|
||||
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
|
||||
];
|
||||
|
||||
// Apply the first set of harsher rules.
|
||||
content = this._filterContentWithRules(content, rules);
|
||||
|
||||
// Apply the standard rules, which mainly cleans things like headers, links, and style blocks.
|
||||
content = this._cleanHTML(content);
|
||||
|
||||
// Check if the string is empty or only contains whitespace.
|
||||
if (content.length === 0 || !content.match(/\S/)) {
|
||||
return content;
|
||||
}
|
||||
|
||||
// Now we let the browser normalize the code by loading it into the DOM and then get the html back.
|
||||
// This gives us well quoted, well formatted code to continue our work on. Word may provide very poorly formatted code.
|
||||
var holder = document.createElement('div');
|
||||
holder.innerHTML = content;
|
||||
content = holder.innerHTML;
|
||||
// Free up the DOM memory.
|
||||
holder.innerHTML = "";
|
||||
|
||||
// Run some more rules that care about quotes and whitespace.
|
||||
rules = [
|
||||
// Remove MSO-blah, MSO:blah in style attributes. Only removes one or more that appear in succession.
|
||||
{regex: /(<[^>]*?style\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[-:][^>;"]*;?)+/gi, replace: "$1"},
|
||||
// Remove MSO classes in class attributes. Only removes one or more that appear in succession.
|
||||
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
|
||||
// Remove Apple- classes in class attributes. Only removes one or more that appear in succession.
|
||||
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*Apple-[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
|
||||
// Remove OLE_LINK# anchors that may litter the code.
|
||||
{regex: /<a [^>]*?name\s*?=\s*?"OLE_LINK\d*?"[^>]*?>\s*?<\/a>/gi, replace: ""},
|
||||
// Remove empty spans.
|
||||
{regex: /<span[^>]*>( |\s)*<\/span>/gi, replace: ""}
|
||||
];
|
||||
|
||||
// Apply the rules.
|
||||
content = this._filterContentWithRules(content, rules);
|
||||
|
||||
// Reapply the standard cleaner to the content.
|
||||
content = this._cleanHTML(content);
|
||||
|
||||
return content;
|
||||
}
|
||||
};
|
||||
|
||||
|
135
lib/editor/atto/yui/src/editor/js/clean.js
vendored
135
lib/editor/atto/yui/src/editor/js/clean.js
vendored
@ -84,49 +84,40 @@ EditorClean.prototype = {
|
||||
* @return {String} The cleaned HTML
|
||||
*/
|
||||
_cleanHTML: function(content) {
|
||||
// What are we doing ?
|
||||
// We are cleaning random HTML from all over the shop into a set of useful html suitable for content.
|
||||
// We are allowing styles etc, but not e.g. font tags, class="MsoNormal" etc.
|
||||
// Removing limited things that can break the page or are disallowed, like unclosed comments, style blocks, etc.
|
||||
|
||||
var rules = [
|
||||
// Remove any style blocks. Some browsers do not work well with them in a contenteditable.
|
||||
// Plus style blocks are not allowed in body html, except with "scoped", which most browsers don't support as of 2015.
|
||||
// Reference: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
|
||||
{regex: /<style[^>]*>[\s\S]*?<\/style>/gi, replace: ""},
|
||||
|
||||
// Source: "http://stackoverflow.com/questions/2875027/clean-microsoft-word-pasted-text-using-javascript"
|
||||
// Source: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
|
||||
|
||||
// Remove all HTML comments.
|
||||
{regex: /<!--[\s\S]*?-->/gi, replace: ""},
|
||||
// Source: "http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php"
|
||||
// Remove <?xml>, <\?xml>.
|
||||
{regex: /<\\?\?xml[^>]*>/gi, replace: ""},
|
||||
// Remove <o:blah>, <\o:blah>.
|
||||
{regex: /<\/?\w+:[^>]*>/gi, replace: ""}, // e.g. <o:p...
|
||||
// Remove MSO-blah, MSO:blah (e.g. in style attributes)
|
||||
{regex: /\s*MSO[-:][^;"']*;?/gi, replace: ""},
|
||||
// Remove empty spans
|
||||
{regex: /<span[^>]*>( |\s)*<\/span>/gi, replace: ""},
|
||||
// Remove class="Msoblah"
|
||||
{regex: /class="Mso[^"]*"/gi, replace: ""},
|
||||
|
||||
// Remove any open HTML comment opens that are not followed by a close. This can completely break page layout.
|
||||
{regex: /<!--(?![\s\S]*?-->)/gi, replace: ""},
|
||||
|
||||
// Source: "http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html"
|
||||
// Remove forbidden tags for content, title, meta, style, st0-9, head, font, html, body, link.
|
||||
{regex: /<(\/?title|\/?meta|\/?style|\/?st\d|\/?head|\/?font|\/?html|\/?body|\/?link|!\[)[^>]*?>/gi, replace: ""},
|
||||
|
||||
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
|
||||
// Replace extended chars with simple text.
|
||||
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
|
||||
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
|
||||
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
|
||||
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
|
||||
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
|
||||
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
|
||||
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
|
||||
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
|
||||
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
|
||||
{regex: /<\/?(?:title|meta|style|st\d|head|font|html|body|link|!\[)[^>]*?>/gi, replace: ""}
|
||||
];
|
||||
|
||||
return this._filterContentWithRules(content, rules);
|
||||
},
|
||||
|
||||
/**
|
||||
* Take the supplied content and run the supplied regex rules on it.
|
||||
*
|
||||
* @method _filterContentWithRules
|
||||
* @private
|
||||
* @param {String} content The content to clean
|
||||
* @param {Array} rules An array of structures: [ {regex: /something/, replace: "something"}, {...}, ...]
|
||||
* @return {String} The cleaned content
|
||||
*/
|
||||
_filterContentWithRules: function(content, rules) {
|
||||
var i = 0;
|
||||
for (i = 0; i < rules.length; i++) {
|
||||
content = content.replace(rules[i].regex, rules[i].replace);
|
||||
@ -188,7 +179,7 @@ EditorClean.prototype = {
|
||||
sourceEvent.preventDefault();
|
||||
|
||||
// Scrub the paste content.
|
||||
content = this._cleanHTML(content);
|
||||
content = this._cleanPasteHTML(content);
|
||||
|
||||
// Save the current selection.
|
||||
// Using saveSelection as it produces a more consistent experience.
|
||||
@ -237,7 +228,7 @@ EditorClean.prototype = {
|
||||
|
||||
// Get, clean, and replace the content in the editable.
|
||||
var content = this.editor.get('innerHTML');
|
||||
this.editor.set('innerHTML', this._cleanHTML(content));
|
||||
this.editor.set('innerHTML', this._cleanPasteHTML(content));
|
||||
|
||||
// Update the textarea.
|
||||
this.updateOriginal();
|
||||
@ -258,6 +249,86 @@ EditorClean.prototype = {
|
||||
Y.soon(Y.bind(this.fallbackPasteCleanup, this));
|
||||
|
||||
return this;
|
||||
},
|
||||
|
||||
/**
|
||||
* Cleanup html that comes from WYSIWYG paste events. These are more likely to contain messy code that we should strip.
|
||||
*
|
||||
* @method _cleanPasteHTML
|
||||
* @private
|
||||
* @param {String} content The html content to clean
|
||||
* @return {String} The cleaned HTML
|
||||
*/
|
||||
_cleanPasteHTML: function(content) {
|
||||
// Return an empty string if passed an invalid or empty object.
|
||||
if (!content || content.length === 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Rules that get rid of the real-nasties and don't care about normalize code (correct quotes, white spaces, etc).
|
||||
var rules = [
|
||||
// Remove any xml blocks.
|
||||
{regex: /<xml[^>]*>[\s\S]*?<\/xml>/gi, replace: ""},
|
||||
// Remove any <?xml><\?xml> blocks.
|
||||
{regex: /<\?xml[^>]*>[\s\S]*?<\\\?xml>/gi, replace: ""},
|
||||
// Remove <o:blah>, <\o:blah>.
|
||||
{regex: /<\/?\w+:[^>]*>/gi, replace: ""},
|
||||
|
||||
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
|
||||
// Replace extended chars with simple text.
|
||||
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
|
||||
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
|
||||
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
|
||||
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
|
||||
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
|
||||
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
|
||||
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
|
||||
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
|
||||
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
|
||||
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
|
||||
];
|
||||
|
||||
// Apply the first set of harsher rules.
|
||||
content = this._filterContentWithRules(content, rules);
|
||||
|
||||
// Apply the standard rules, which mainly cleans things like headers, links, and style blocks.
|
||||
content = this._cleanHTML(content);
|
||||
|
||||
// Check if the string is empty or only contains whitespace.
|
||||
if (content.length === 0 || !content.match(/\S/)) {
|
||||
return content;
|
||||
}
|
||||
|
||||
// Now we let the browser normalize the code by loading it into the DOM and then get the html back.
|
||||
// This gives us well quoted, well formatted code to continue our work on. Word may provide very poorly formatted code.
|
||||
var holder = document.createElement('div');
|
||||
holder.innerHTML = content;
|
||||
content = holder.innerHTML;
|
||||
// Free up the DOM memory.
|
||||
holder.innerHTML = "";
|
||||
|
||||
// Run some more rules that care about quotes and whitespace.
|
||||
rules = [
|
||||
// Remove MSO-blah, MSO:blah in style attributes. Only removes one or more that appear in succession.
|
||||
{regex: /(<[^>]*?style\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[-:][^>;"]*;?)+/gi, replace: "$1"},
|
||||
// Remove MSO classes in class attributes. Only removes one or more that appear in succession.
|
||||
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
|
||||
// Remove Apple- classes in class attributes. Only removes one or more that appear in succession.
|
||||
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*Apple-[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
|
||||
// Remove OLE_LINK# anchors that may litter the code.
|
||||
{regex: /<a [^>]*?name\s*?=\s*?"OLE_LINK\d*?"[^>]*?>\s*?<\/a>/gi, replace: ""},
|
||||
// Remove empty spans.
|
||||
{regex: /<span[^>]*>( |\s)*<\/span>/gi, replace: ""}
|
||||
];
|
||||
|
||||
// Apply the rules.
|
||||
content = this._filterContentWithRules(content, rules);
|
||||
|
||||
// Reapply the standard cleaner to the content.
|
||||
content = this._cleanHTML(content);
|
||||
|
||||
return content;
|
||||
}
|
||||
};
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user