Merge branch 'MDL-47003-master' of git://github.com/merrill-oakland/moodle

This commit is contained in:
David Monllao 2015-03-24 10:17:28 +08:00
commit b6f7863e77
4 changed files with 312 additions and 99 deletions

View File

@ -1109,49 +1109,40 @@ EditorClean.prototype = {
* @return {String} The cleaned HTML
*/
_cleanHTML: function(content) {
// Run the content through the regex rules listed below and return the filtered result.
// What are we doing ?
// We are cleaning random HTML from all over the shop into a set of useful html suitable for content.
// We are allowing styles etc, but not e.g. font tags, class="MsoNormal" etc.
// Removing limited things that can break the page or are disallowed, like unclosed comments, style blocks, etc.
var rules = [
// Remove any style blocks. Some browsers do not work well with them in a contenteditable.
// Plus style blocks are not allowed in body html, except with "scoped", which most browsers don't support as of 2015.
// Reference: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
{regex: /<style[^>]*>[\s\S]*?<\/style>/gi, replace: ""},
// Source: "http://stackoverflow.com/questions/2875027/clean-microsoft-word-pasted-text-using-javascript"
// Source: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
// Remove all HTML comments.
{regex: /<!--[\s\S]*?-->/gi, replace: ""},
// Source: "http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php"
// Remove <?xml>, <\?xml>.
{regex: /<\\?\?xml[^>]*>/gi, replace: ""},
// Remove namespaced tags such as <o:blah> and </o:blah>.
{regex: /<\/?\w+:[^>]*>/gi, replace: ""}, // e.g. <o:p...
// Remove MSO-blah, MSO:blah (e.g. in style attributes)
{regex: /\s*MSO[-:][^;"']*;?/gi, replace: ""},
// Remove empty spans
{regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""},
// Remove class="Msoblah"
{regex: /class="Mso[^"]*"/gi, replace: ""},
// Remove any HTML comment openers that are not followed by a close. This can completely break page layout.
{regex: /<!--(?![\s\S]*?-->)/gi, replace: ""},
// Source: "http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html"
// Remove tags that are forbidden in content: title, meta, style, st0-9, head, font, html, body, link.
// Tags that open with "<![" are matched and removed as well.
{regex: /<(\/?title|\/?meta|\/?style|\/?st\d|\/?head|\/?font|\/?html|\/?body|\/?link|!\[)[^>]*?>/gi, replace: ""},
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
// Replace extended chars with simple text.
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
{regex: /<\/?(?:title|meta|style|st\d|head|font|html|body|link|!\[)[^>]*?>/gi, replace: ""}
];
return this._filterContentWithRules(content, rules);
},
/**
* Take the supplied content and run on the supplied regex rules.
*
* @method _filterContentWithRules
* @private
* @param {String} content The content to clean
* @param {Array} rules An array of structures: [ {regex: /something/, replace: "something"}, {...}, ...]
* @return {String} The cleaned content
*/
_filterContentWithRules: function(content, rules) {
var i = 0;
for (i = 0; i < rules.length; i++) {
content = content.replace(rules[i].regex, rules[i].replace);
@ -1213,7 +1204,7 @@ EditorClean.prototype = {
sourceEvent.preventDefault();
// Scrub the paste content.
content = this._cleanHTML(content);
content = this._cleanPasteHTML(content);
// Save the current selection.
// Using saveSelection as it produces a more consistent experience.
@ -1262,7 +1253,7 @@ EditorClean.prototype = {
// Get, clean, and replace the content in the editable.
var content = this.editor.get('innerHTML');
this.editor.set('innerHTML', this._cleanHTML(content));
this.editor.set('innerHTML', this._cleanPasteHTML(content));
// Update the textarea.
this.updateOriginal();
@ -1283,6 +1274,86 @@ EditorClean.prototype = {
Y.soon(Y.bind(this.fallbackPasteCleanup, this));
return this;
},
/**
* Cleanup html that comes from WYSIWYG paste events. These are more likely to contain messy code that we should strip.
*
* @method _cleanPasteHTML
* @private
* @param {String} content The html content to clean
* @return {String} The cleaned HTML
*/
_cleanPasteHTML: function(content) {
// Return an empty string if passed an invalid or empty object.
if (!content || content.length === 0) {
return "";
}
// Rules that get rid of the real-nasties and don't care about normalize code (correct quotes, white spaces, etc).
var rules = [
// Remove any xml blocks.
{regex: /<xml[^>]*>[\s\S]*?<\/xml>/gi, replace: ""},
// Remove any <?xml><\?xml> blocks.
{regex: /<\?xml[^>]*>[\s\S]*?<\\\?xml>/gi, replace: ""},
// Remove <o:blah>, <\o:blah>.
{regex: /<\/?\w+:[^>]*>/gi, replace: ""},
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
// Replace extended chars with simple text.
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
];
// Apply the first set of harsher rules.
content = this._filterContentWithRules(content, rules);
// Apply the standard rules, which mainly cleans things like headers, links, and style blocks.
content = this._cleanHTML(content);
// Check if the string is empty or only contains whitespace.
if (content.length === 0 || !content.match(/\S/)) {
return content;
}
// Now we let the browser normalize the code by loading it into the DOM and then get the html back.
// This gives us well quoted, well formatted code to continue our work on. Word may provide very poorly formatted code.
var holder = document.createElement('div');
holder.innerHTML = content;
content = holder.innerHTML;
// Free up the DOM memory.
holder.innerHTML = "";
// Run some more rules that care about quotes and whitespace.
rules = [
// Remove MSO-blah, MSO:blah in style attributes. Only removes one or more that appear in succession.
{regex: /(<[^>]*?style\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[-:][^>;"]*;?)+/gi, replace: "$1"},
// Remove MSO classes in class attributes. Only removes one or more that appear in succession.
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
// Remove Apple- classes in class attributes. Only removes one or more that appear in succession.
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*Apple-[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
// Remove OLE_LINK# anchors that may litter the code.
{regex: /<a [^>]*?name\s*?=\s*?"OLE_LINK\d*?"[^>]*?>\s*?<\/a>/gi, replace: ""},
// Remove empty spans.
{regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""}
];
// Apply the rules.
content = this._filterContentWithRules(content, rules);
// Reapply the standard cleaner to the content.
content = this._cleanHTML(content);
return content;
}
};

File diff suppressed because one or more lines are too long

View File

@ -1099,49 +1099,40 @@ EditorClean.prototype = {
* @return {String} The cleaned HTML
*/
_cleanHTML: function(content) {
// Run the content through the regex rules listed below and return the filtered result.
// What are we doing ?
// We are cleaning random HTML from all over the shop into a set of useful html suitable for content.
// We are allowing styles etc, but not e.g. font tags, class="MsoNormal" etc.
// Removing limited things that can break the page or are disallowed, like unclosed comments, style blocks, etc.
var rules = [
// Remove any style blocks. Some browsers do not work well with them in a contenteditable.
// Plus style blocks are not allowed in body html, except with "scoped", which most browsers don't support as of 2015.
// Reference: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
{regex: /<style[^>]*>[\s\S]*?<\/style>/gi, replace: ""},
// Source: "http://stackoverflow.com/questions/2875027/clean-microsoft-word-pasted-text-using-javascript"
// Source: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
// Remove all HTML comments.
{regex: /<!--[\s\S]*?-->/gi, replace: ""},
// Source: "http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php"
// Remove <?xml>, <\?xml>.
{regex: /<\\?\?xml[^>]*>/gi, replace: ""},
// Remove namespaced tags such as <o:blah> and </o:blah>.
{regex: /<\/?\w+:[^>]*>/gi, replace: ""}, // e.g. <o:p...
// Remove MSO-blah, MSO:blah (e.g. in style attributes)
{regex: /\s*MSO[-:][^;"']*;?/gi, replace: ""},
// Remove empty spans
{regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""},
// Remove class="Msoblah"
{regex: /class="Mso[^"]*"/gi, replace: ""},
// Remove any HTML comment openers that are not followed by a close. This can completely break page layout.
{regex: /<!--(?![\s\S]*?-->)/gi, replace: ""},
// Source: "http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html"
// Remove tags that are forbidden in content: title, meta, style, st0-9, head, font, html, body, link.
// Tags that open with "<![" are matched and removed as well.
{regex: /<(\/?title|\/?meta|\/?style|\/?st\d|\/?head|\/?font|\/?html|\/?body|\/?link|!\[)[^>]*?>/gi, replace: ""},
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
// Replace extended chars with simple text.
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
{regex: /<\/?(?:title|meta|style|st\d|head|font|html|body|link|!\[)[^>]*?>/gi, replace: ""}
];
return this._filterContentWithRules(content, rules);
},
/**
* Take the supplied content and run on the supplied regex rules.
*
* @method _filterContentWithRules
* @private
* @param {String} content The content to clean
* @param {Array} rules An array of structures: [ {regex: /something/, replace: "something"}, {...}, ...]
* @return {String} The cleaned content
*/
_filterContentWithRules: function(content, rules) {
var i = 0;
for (i = 0; i < rules.length; i++) {
content = content.replace(rules[i].regex, rules[i].replace);
@ -1203,7 +1194,7 @@ EditorClean.prototype = {
sourceEvent.preventDefault();
// Scrub the paste content.
content = this._cleanHTML(content);
content = this._cleanPasteHTML(content);
// Save the current selection.
// Using saveSelection as it produces a more consistent experience.
@ -1251,7 +1242,7 @@ EditorClean.prototype = {
// Get, clean, and replace the content in the editable.
var content = this.editor.get('innerHTML');
this.editor.set('innerHTML', this._cleanHTML(content));
this.editor.set('innerHTML', this._cleanPasteHTML(content));
// Update the textarea.
this.updateOriginal();
@ -1272,6 +1263,86 @@ EditorClean.prototype = {
Y.soon(Y.bind(this.fallbackPasteCleanup, this));
return this;
},
/**
* Cleanup html that comes from WYSIWYG paste events. These are more likely to contain messy code that we should strip.
*
* @method _cleanPasteHTML
* @private
* @param {String} content The html content to clean
* @return {String} The cleaned HTML
*/
_cleanPasteHTML: function(content) {
// Return an empty string if passed an invalid or empty object.
if (!content || content.length === 0) {
return "";
}
// Rules that get rid of the real-nasties and don't care about normalize code (correct quotes, white spaces, etc).
var rules = [
// Remove any xml blocks.
{regex: /<xml[^>]*>[\s\S]*?<\/xml>/gi, replace: ""},
// Remove any <?xml><\?xml> blocks.
{regex: /<\?xml[^>]*>[\s\S]*?<\\\?xml>/gi, replace: ""},
// Remove <o:blah>, <\o:blah>.
{regex: /<\/?\w+:[^>]*>/gi, replace: ""},
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
// Replace extended chars with simple text.
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
];
// Apply the first set of harsher rules.
content = this._filterContentWithRules(content, rules);
// Apply the standard rules, which mainly cleans things like headers, links, and style blocks.
content = this._cleanHTML(content);
// Check if the string is empty or only contains whitespace.
if (content.length === 0 || !content.match(/\S/)) {
return content;
}
// Now we let the browser normalize the code by loading it into the DOM and then get the html back.
// This gives us well quoted, well formatted code to continue our work on. Word may provide very poorly formatted code.
var holder = document.createElement('div');
holder.innerHTML = content;
content = holder.innerHTML;
// Free up the DOM memory.
holder.innerHTML = "";
// Run some more rules that care about quotes and whitespace.
rules = [
// Remove MSO-blah, MSO:blah in style attributes. Only removes one or more that appear in succession.
{regex: /(<[^>]*?style\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[-:][^>;"]*;?)+/gi, replace: "$1"},
// Remove MSO classes in class attributes. Only removes one or more that appear in succession.
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
// Remove Apple- classes in class attributes. Only removes one or more that appear in succession.
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*Apple-[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
// Remove OLE_LINK# anchors that may litter the code.
{regex: /<a [^>]*?name\s*?=\s*?"OLE_LINK\d*?"[^>]*?>\s*?<\/a>/gi, replace: ""},
// Remove empty spans.
{regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""}
];
// Apply the rules.
content = this._filterContentWithRules(content, rules);
// Reapply the standard cleaner to the content.
content = this._cleanHTML(content);
return content;
}
};

View File

@ -84,49 +84,40 @@ EditorClean.prototype = {
* @return {String} The cleaned HTML
*/
_cleanHTML: function(content) {
// Run the content through the regex rules listed below and return the filtered result.
// What are we doing ?
// We are cleaning random HTML from all over the shop into a set of useful html suitable for content.
// We are allowing styles etc, but not e.g. font tags, class="MsoNormal" etc.
// Removing limited things that can break the page or are disallowed, like unclosed comments, style blocks, etc.
var rules = [
// Remove any style blocks. Some browsers do not work well with them in a contenteditable.
// Plus style blocks are not allowed in body html, except with "scoped", which most browsers don't support as of 2015.
// Reference: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
{regex: /<style[^>]*>[\s\S]*?<\/style>/gi, replace: ""},
// Source: "http://stackoverflow.com/questions/2875027/clean-microsoft-word-pasted-text-using-javascript"
// Source: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
// Remove all HTML comments.
{regex: /<!--[\s\S]*?-->/gi, replace: ""},
// Source: "http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php"
// Remove <?xml>, <\?xml>.
{regex: /<\\?\?xml[^>]*>/gi, replace: ""},
// Remove namespaced tags such as <o:blah> and </o:blah>.
{regex: /<\/?\w+:[^>]*>/gi, replace: ""}, // e.g. <o:p...
// Remove MSO-blah, MSO:blah (e.g. in style attributes)
{regex: /\s*MSO[-:][^;"']*;?/gi, replace: ""},
// Remove empty spans
{regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""},
// Remove class="Msoblah"
{regex: /class="Mso[^"]*"/gi, replace: ""},
// Remove any HTML comment openers that are not followed by a close. This can completely break page layout.
{regex: /<!--(?![\s\S]*?-->)/gi, replace: ""},
// Source: "http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html"
// Remove tags that are forbidden in content: title, meta, style, st0-9, head, font, html, body, link.
// Tags that open with "<![" are matched and removed as well.
{regex: /<(\/?title|\/?meta|\/?style|\/?st\d|\/?head|\/?font|\/?html|\/?body|\/?link|!\[)[^>]*?>/gi, replace: ""},
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
// Replace extended chars with simple text.
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
{regex: /<\/?(?:title|meta|style|st\d|head|font|html|body|link|!\[)[^>]*?>/gi, replace: ""}
];
return this._filterContentWithRules(content, rules);
},
/**
* Take the supplied content and run on the supplied regex rules.
*
* @method _filterContentWithRules
* @private
* @param {String} content The content to clean
* @param {Array} rules An array of structures: [ {regex: /something/, replace: "something"}, {...}, ...]
* @return {String} The cleaned content
*/
_filterContentWithRules: function(content, rules) {
var i = 0;
for (i = 0; i < rules.length; i++) {
content = content.replace(rules[i].regex, rules[i].replace);
@ -188,7 +179,7 @@ EditorClean.prototype = {
sourceEvent.preventDefault();
// Scrub the paste content.
content = this._cleanHTML(content);
content = this._cleanPasteHTML(content);
// Save the current selection.
// Using saveSelection as it produces a more consistent experience.
@ -237,7 +228,7 @@ EditorClean.prototype = {
// Get, clean, and replace the content in the editable.
var content = this.editor.get('innerHTML');
this.editor.set('innerHTML', this._cleanHTML(content));
this.editor.set('innerHTML', this._cleanPasteHTML(content));
// Update the textarea.
this.updateOriginal();
@ -258,6 +249,86 @@ EditorClean.prototype = {
Y.soon(Y.bind(this.fallbackPasteCleanup, this));
return this;
},
/**
* Cleanup html that comes from WYSIWYG paste events. These are more likely to contain messy code that we should strip.
*
* @method _cleanPasteHTML
* @private
* @param {String} content The html content to clean
* @return {String} The cleaned HTML
*/
_cleanPasteHTML: function(content) {
// Return an empty string if passed an invalid or empty object.
if (!content || content.length === 0) {
return "";
}
// Rules that get rid of the real-nasties and don't care about normalize code (correct quotes, white spaces, etc).
var rules = [
// Remove any xml blocks.
{regex: /<xml[^>]*>[\s\S]*?<\/xml>/gi, replace: ""},
// Remove any <?xml><\?xml> blocks.
{regex: /<\?xml[^>]*>[\s\S]*?<\\\?xml>/gi, replace: ""},
// Remove <o:blah>, <\o:blah>.
{regex: /<\/?\w+:[^>]*>/gi, replace: ""},
// Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
// Replace extended chars with simple text.
{regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
{regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
{regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
{regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
{regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
{regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
{regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
{regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
{regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
{regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
];
// Apply the first set of harsher rules.
content = this._filterContentWithRules(content, rules);
// Apply the standard rules, which mainly cleans things like headers, links, and style blocks.
content = this._cleanHTML(content);
// Check if the string is empty or only contains whitespace.
if (content.length === 0 || !content.match(/\S/)) {
return content;
}
// Now we let the browser normalize the code by loading it into the DOM and then get the html back.
// This gives us well quoted, well formatted code to continue our work on. Word may provide very poorly formatted code.
var holder = document.createElement('div');
holder.innerHTML = content;
content = holder.innerHTML;
// Free up the DOM memory.
holder.innerHTML = "";
// Run some more rules that care about quotes and whitespace.
rules = [
// Remove MSO-blah, MSO:blah in style attributes. Only removes one or more that appear in succession.
{regex: /(<[^>]*?style\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[-:][^>;"]*;?)+/gi, replace: "$1"},
// Remove MSO classes in class attributes. Only removes one or more that appear in succession.
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
// Remove Apple- classes in class attributes. Only removes one or more that appear in succession.
{regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*Apple-[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
// Remove OLE_LINK# anchors that may litter the code.
{regex: /<a [^>]*?name\s*?=\s*?"OLE_LINK\d*?"[^>]*?>\s*?<\/a>/gi, replace: ""},
// Remove empty spans.
{regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""}
];
// Apply the rules.
content = this._filterContentWithRules(content, rules);
// Reapply the standard cleaner to the content.
content = this._cleanHTML(content);
return content;
}
};