Merge branch 'MDL-47003-master' of git://github.com/merrill-oakland/moodle

2025-04-21 16:32:18 +02:00 · 2015-03-24 10:17:28 +08:00 · 2015-03-24 10:17:28 +08:00 · b6f7863e77
commit b6f7863e77
parent 6ae45106fc 3ef9636190
4 changed files with 312 additions and 99 deletions
--- a/lib/editor/atto/yui/build/moodle-editor_atto-editor/moodle-editor_atto-editor-debug.js
+++ b/lib/editor/atto/yui/build/moodle-editor_atto-editor/moodle-editor_atto-editor-debug.js
@ -1109,49 +1109,40 @@ EditorClean.prototype = {
     * @return {String} The cleaned HTML
     */
    _cleanHTML: function(content) {
-        // What are we doing ?
-        // We are cleaning random HTML from all over the shop into a set of useful html suitable for content.
-        // We are allowing styles etc, but not e.g. font tags, class="MsoNormal" etc.
+        // Remove a limited set of things that can break the page or are disallowed, like unclosed comments, style blocks, etc.

        var rules = [
+            // Remove any style blocks. Some browsers do not work well with them in a contenteditable.
+            // Plus style blocks are not allowed in body html, except with "scoped", which most browsers don't support as of 2015.
+            // Reference: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
+            {regex: /<style[^>]*>[\s\S]*?<\/style>/gi, replace: ""},
+
            // Source: "http://stackoverflow.com/questions/2875027/clean-microsoft-word-pasted-text-using-javascript"
            // Source: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
-
            // Remove all HTML comments.
            {regex: /<!--[\s\S]*?-->/gi, replace: ""},
-            // Source: "http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php"
-            // Remove <?xml>, <\?xml>.
-            {regex: /<\\?\?xml[^>]*>/gi, replace: ""},
-            // Remove <o:blah>, <\o:blah>.
-            {regex: /<\/?\w+:[^>]*>/gi, replace: ""}, // e.g. <o:p...
-            // Remove MSO-blah, MSO:blah (e.g. in style attributes)
-            {regex: /\s*MSO[-:][^;"']*;?/gi, replace: ""},
-            // Remove empty spans
-            {regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""},
-            // Remove class="Msoblah"
-            {regex: /class="Mso[^"]*"/gi, replace: ""},
+
            // Remove any open HTML comment opens that are not followed by a close. This can completely break page layout.
            {regex: /<!--(?![\s\S]*?-->)/gi, replace: ""},

            // Source: "http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html"
            // Remove forbidden tags for content, title, meta, style, st0-9, head, font, html, body, link.
-            {regex: /<(\/?title|\/?meta|\/?style|\/?st\d|\/?head|\/?font|\/?html|\/?body|\/?link|!\[)[^>]*?>/gi, replace: ""},
-
-            // Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
-            // Replace extended chars with simple text.
-            {regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
-            {regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
-            {regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
-            {regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
-            {regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
-            {regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
-            {regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
-            {regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
-            {regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
-            {regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
-            {regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
+            {regex: /<\/?(?:title|meta|style|st\d|head|font|html|body|link|!\[)[^>]*?>/gi, replace: ""}
        ];

+        return this._filterContentWithRules(content, rules);
+    },
+
+    /**
+     * Take the supplied content and apply the supplied regex rules to it, in order.
+     *
+     * @method _filterContentWithRules
+     * @private
+     * @param {String} content The content to clean
+     * @param {Array} rules An array of structures: [ {regex: /something/, replace: "something"}, {...}, ...]
+     * @return {String} The cleaned content
+     */
+    _filterContentWithRules: function(content, rules) {
        var i = 0;
        for (i = 0; i < rules.length; i++) {
            content = content.replace(rules[i].regex, rules[i].replace);
@ -1213,7 +1204,7 @@ EditorClean.prototype = {
                    sourceEvent.preventDefault();

                    // Scrub the paste content.
-                    content = this._cleanHTML(content);
+                    content = this._cleanPasteHTML(content);

                    // Save the current selection.
                    // Using saveSelection as it produces a more consistent experience.
@ -1262,7 +1253,7 @@ EditorClean.prototype = {

        // Get, clean, and replace the content in the editable.
        var content = this.editor.get('innerHTML');
-        this.editor.set('innerHTML', this._cleanHTML(content));
+        this.editor.set('innerHTML', this._cleanPasteHTML(content));

        // Update the textarea.
        this.updateOriginal();
@ -1283,6 +1274,86 @@ EditorClean.prototype = {
        Y.soon(Y.bind(this.fallbackPasteCleanup, this));

        return this;
+    },
+
+    /**
+     * Cleanup html that comes from WYSIWYG paste events. These are more likely to contain messy code that we should strip.
+     *
+     * @method _cleanPasteHTML
+     * @private
+     * @param {String} content The html content to clean
+     * @return {String} The cleaned HTML
+     */
+    _cleanPasteHTML: function(content) {
+        // Return an empty string if passed an invalid or empty object.
+        if (!content || content.length === 0) {
+            return "";
+        }
+
+        // Rules that get rid of the real nasties and don't depend on normalized code (correct quotes, whitespace, etc).
+        var rules = [
+            // Remove any xml blocks.
+            {regex: /<xml[^>]*>[\s\S]*?<\/xml>/gi, replace: ""},
+            // Remove any <?xml><\?xml> blocks.
+            {regex: /<\?xml[^>]*>[\s\S]*?<\\\?xml>/gi, replace: ""},
+            // Remove <o:blah>, </o:blah>.
+            {regex: /<\/?\w+:[^>]*>/gi, replace: ""},
+
+            // Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
+            // Replace extended chars with simple text.
+            {regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
+            {regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
+            {regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
+            {regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
+            {regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
+            {regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
+            {regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
+            {regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
+            {regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
+            {regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
+            {regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
+        ];
+
+        // Apply the first set of harsher rules.
+        content = this._filterContentWithRules(content, rules);
+
+        // Apply the standard rules, which mainly cleans things like headers, links, and style blocks.
+        content = this._cleanHTML(content);
+
+        // Check if the string is empty or only contains whitespace.
+        if (content.length === 0 || !content.match(/\S/)) {
+            return content;
+        }
+
+        // Now we let the browser normalize the code by loading it into the DOM and then get the html back.
+        // This gives us well quoted, well formatted code to continue our work on. Word may provide very poorly formatted code.
+        var holder = document.createElement('div');
+        holder.innerHTML = content;
+        content = holder.innerHTML;
+        // Free up the DOM memory.
+        holder.innerHTML = "";
+
+        // Run some more rules that care about quotes and whitespace.
+        rules = [
+            // Remove MSO-blah, MSO:blah in style attributes. Only removes one or more that appear in succession.
+            {regex: /(<[^>]*?style\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[-:][^>;"]*;?)+/gi, replace: "$1"},
+            // Remove MSO classes in class attributes. Only removes one or more that appear in succession.
+            {regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
+            // Remove Apple- classes in class attributes. Only removes one or more that appear in succession.
+            {regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*Apple-[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
+            // Remove OLE_LINK# anchors that may litter the code.
+            {regex: /<a [^>]*?name\s*?=\s*?"OLE_LINK\d*?"[^>]*?>\s*?<\/a>/gi, replace: ""},
+            // Remove empty spans.
+            {regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""}
+        ];
+
+        // Apply the rules.
+        content = this._filterContentWithRules(content, rules);
+
+        // Reapply the standard cleaner to the content.
+        content = this._cleanHTML(content);
+
+        return content;
    }
 };

--- a/lib/editor/atto/yui/build/moodle-editor_atto-editor/moodle-editor_atto-editor-min.js
+++ b/lib/editor/atto/yui/build/moodle-editor_atto-editor/moodle-editor_atto-editor-min.js
--- a/lib/editor/atto/yui/build/moodle-editor_atto-editor/moodle-editor_atto-editor.js
+++ b/lib/editor/atto/yui/build/moodle-editor_atto-editor/moodle-editor_atto-editor.js
@ -1099,49 +1099,40 @@ EditorClean.prototype = {
     * @return {String} The cleaned HTML
     */
    _cleanHTML: function(content) {
-        // What are we doing ?
-        // We are cleaning random HTML from all over the shop into a set of useful html suitable for content.
-        // We are allowing styles etc, but not e.g. font tags, class="MsoNormal" etc.
+        // Remove a limited set of things that can break the page or are disallowed, like unclosed comments, style blocks, etc.

        var rules = [
+            // Remove any style blocks. Some browsers do not work well with them in a contenteditable.
+            // Plus style blocks are not allowed in body html, except with "scoped", which most browsers don't support as of 2015.
+            // Reference: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
+            {regex: /<style[^>]*>[\s\S]*?<\/style>/gi, replace: ""},
+
            // Source: "http://stackoverflow.com/questions/2875027/clean-microsoft-word-pasted-text-using-javascript"
            // Source: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
-
            // Remove all HTML comments.
            {regex: /<!--[\s\S]*?-->/gi, replace: ""},
-            // Source: "http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php"
-            // Remove <?xml>, <\?xml>.
-            {regex: /<\\?\?xml[^>]*>/gi, replace: ""},
-            // Remove <o:blah>, <\o:blah>.
-            {regex: /<\/?\w+:[^>]*>/gi, replace: ""}, // e.g. <o:p...
-            // Remove MSO-blah, MSO:blah (e.g. in style attributes)
-            {regex: /\s*MSO[-:][^;"']*;?/gi, replace: ""},
-            // Remove empty spans
-            {regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""},
-            // Remove class="Msoblah"
-            {regex: /class="Mso[^"]*"/gi, replace: ""},
+
            // Remove any open HTML comment opens that are not followed by a close. This can completely break page layout.
            {regex: /<!--(?![\s\S]*?-->)/gi, replace: ""},

            // Source: "http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html"
            // Remove forbidden tags for content, title, meta, style, st0-9, head, font, html, body, link.
-            {regex: /<(\/?title|\/?meta|\/?style|\/?st\d|\/?head|\/?font|\/?html|\/?body|\/?link|!\[)[^>]*?>/gi, replace: ""},
-
-            // Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
-            // Replace extended chars with simple text.
-            {regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
-            {regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
-            {regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
-            {regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
-            {regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
-            {regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
-            {regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
-            {regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
-            {regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
-            {regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
-            {regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
+            {regex: /<\/?(?:title|meta|style|st\d|head|font|html|body|link|!\[)[^>]*?>/gi, replace: ""}
        ];

+        return this._filterContentWithRules(content, rules);
+    },
+
+    /**
+     * Take the supplied content and apply the supplied regex rules to it, in order.
+     *
+     * @method _filterContentWithRules
+     * @private
+     * @param {String} content The content to clean
+     * @param {Array} rules An array of structures: [ {regex: /something/, replace: "something"}, {...}, ...]
+     * @return {String} The cleaned content
+     */
+    _filterContentWithRules: function(content, rules) {
        var i = 0;
        for (i = 0; i < rules.length; i++) {
            content = content.replace(rules[i].regex, rules[i].replace);
@ -1203,7 +1194,7 @@ EditorClean.prototype = {
                    sourceEvent.preventDefault();

                    // Scrub the paste content.
-                    content = this._cleanHTML(content);
+                    content = this._cleanPasteHTML(content);

                    // Save the current selection.
                    // Using saveSelection as it produces a more consistent experience.
@ -1251,7 +1242,7 @@ EditorClean.prototype = {

        // Get, clean, and replace the content in the editable.
        var content = this.editor.get('innerHTML');
-        this.editor.set('innerHTML', this._cleanHTML(content));
+        this.editor.set('innerHTML', this._cleanPasteHTML(content));

        // Update the textarea.
        this.updateOriginal();
@ -1272,6 +1263,86 @@ EditorClean.prototype = {
        Y.soon(Y.bind(this.fallbackPasteCleanup, this));

        return this;
+    },
+
+    /**
+     * Cleanup html that comes from WYSIWYG paste events. These are more likely to contain messy code that we should strip.
+     *
+     * @method _cleanPasteHTML
+     * @private
+     * @param {String} content The html content to clean
+     * @return {String} The cleaned HTML
+     */
+    _cleanPasteHTML: function(content) {
+        // Return an empty string if passed an invalid or empty object.
+        if (!content || content.length === 0) {
+            return "";
+        }
+
+        // Rules that get rid of the real nasties and don't depend on normalized code (correct quotes, whitespace, etc).
+        var rules = [
+            // Remove any xml blocks.
+            {regex: /<xml[^>]*>[\s\S]*?<\/xml>/gi, replace: ""},
+            // Remove any <?xml><\?xml> blocks.
+            {regex: /<\?xml[^>]*>[\s\S]*?<\\\?xml>/gi, replace: ""},
+            // Remove <o:blah>, </o:blah>.
+            {regex: /<\/?\w+:[^>]*>/gi, replace: ""},
+
+            // Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
+            // Replace extended chars with simple text.
+            {regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
+            {regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
+            {regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
+            {regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
+            {regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
+            {regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
+            {regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
+            {regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
+            {regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
+            {regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
+            {regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
+        ];
+
+        // Apply the first set of harsher rules.
+        content = this._filterContentWithRules(content, rules);
+
+        // Apply the standard rules, which mainly cleans things like headers, links, and style blocks.
+        content = this._cleanHTML(content);
+
+        // Check if the string is empty or only contains whitespace.
+        if (content.length === 0 || !content.match(/\S/)) {
+            return content;
+        }
+
+        // Now we let the browser normalize the code by loading it into the DOM and then get the html back.
+        // This gives us well quoted, well formatted code to continue our work on. Word may provide very poorly formatted code.
+        var holder = document.createElement('div');
+        holder.innerHTML = content;
+        content = holder.innerHTML;
+        // Free up the DOM memory.
+        holder.innerHTML = "";
+
+        // Run some more rules that care about quotes and whitespace.
+        rules = [
+            // Remove MSO-blah, MSO:blah in style attributes. Only removes one or more that appear in succession.
+            {regex: /(<[^>]*?style\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[-:][^>;"]*;?)+/gi, replace: "$1"},
+            // Remove MSO classes in class attributes. Only removes one or more that appear in succession.
+            {regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
+            // Remove Apple- classes in class attributes. Only removes one or more that appear in succession.
+            {regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*Apple-[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
+            // Remove OLE_LINK# anchors that may litter the code.
+            {regex: /<a [^>]*?name\s*?=\s*?"OLE_LINK\d*?"[^>]*?>\s*?<\/a>/gi, replace: ""},
+            // Remove empty spans.
+            {regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""}
+        ];
+
+        // Apply the rules.
+        content = this._filterContentWithRules(content, rules);
+
+        // Reapply the standard cleaner to the content.
+        content = this._cleanHTML(content);
+
+        return content;
    }
 };

--- a/lib/editor/atto/yui/src/editor/js/clean.js
+++ b/lib/editor/atto/yui/src/editor/js/clean.js
@ -84,49 +84,40 @@ EditorClean.prototype = {
     * @return {String} The cleaned HTML
     */
    _cleanHTML: function(content) {
-        // What are we doing ?
-        // We are cleaning random HTML from all over the shop into a set of useful html suitable for content.
-        // We are allowing styles etc, but not e.g. font tags, class="MsoNormal" etc.
+        // Remove a limited set of things that can break the page or are disallowed, like unclosed comments, style blocks, etc.

        var rules = [
+            // Remove any style blocks. Some browsers do not work well with them in a contenteditable.
+            // Plus style blocks are not allowed in body html, except with "scoped", which most browsers don't support as of 2015.
+            // Reference: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
+            {regex: /<style[^>]*>[\s\S]*?<\/style>/gi, replace: ""},
+
            // Source: "http://stackoverflow.com/questions/2875027/clean-microsoft-word-pasted-text-using-javascript"
            // Source: "http://stackoverflow.com/questions/1068280/javascript-regex-multiline-flag-doesnt-work"
-
            // Remove all HTML comments.
            {regex: /<!--[\s\S]*?-->/gi, replace: ""},
-            // Source: "http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php"
-            // Remove <?xml>, <\?xml>.
-            {regex: /<\\?\?xml[^>]*>/gi, replace: ""},
-            // Remove <o:blah>, <\o:blah>.
-            {regex: /<\/?\w+:[^>]*>/gi, replace: ""}, // e.g. <o:p...
-            // Remove MSO-blah, MSO:blah (e.g. in style attributes)
-            {regex: /\s*MSO[-:][^;"']*;?/gi, replace: ""},
-            // Remove empty spans
-            {regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""},
-            // Remove class="Msoblah"
-            {regex: /class="Mso[^"]*"/gi, replace: ""},
+
            // Remove any open HTML comment opens that are not followed by a close. This can completely break page layout.
            {regex: /<!--(?![\s\S]*?-->)/gi, replace: ""},

            // Source: "http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html"
            // Remove forbidden tags for content, title, meta, style, st0-9, head, font, html, body, link.
-            {regex: /<(\/?title|\/?meta|\/?style|\/?st\d|\/?head|\/?font|\/?html|\/?body|\/?link|!\[)[^>]*?>/gi, replace: ""},
-
-            // Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
-            // Replace extended chars with simple text.
-            {regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
-            {regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
-            {regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
-            {regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
-            {regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
-            {regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
-            {regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
-            {regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
-            {regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
-            {regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
-            {regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
+            {regex: /<\/?(?:title|meta|style|st\d|head|font|html|body|link|!\[)[^>]*?>/gi, replace: ""}
        ];

+        return this._filterContentWithRules(content, rules);
+    },
+
+    /**
+     * Take the supplied content and apply the supplied regex rules to it, in order.
+     *
+     * @method _filterContentWithRules
+     * @private
+     * @param {String} content The content to clean
+     * @param {Array} rules An array of structures: [ {regex: /something/, replace: "something"}, {...}, ...]
+     * @return {String} The cleaned content
+     */
+    _filterContentWithRules: function(content, rules) {
        var i = 0;
        for (i = 0; i < rules.length; i++) {
            content = content.replace(rules[i].regex, rules[i].replace);
@ -188,7 +179,7 @@ EditorClean.prototype = {
                    sourceEvent.preventDefault();

                    // Scrub the paste content.
-                    content = this._cleanHTML(content);
+                    content = this._cleanPasteHTML(content);

                    // Save the current selection.
                    // Using saveSelection as it produces a more consistent experience.
@ -237,7 +228,7 @@ EditorClean.prototype = {

        // Get, clean, and replace the content in the editable.
        var content = this.editor.get('innerHTML');
-        this.editor.set('innerHTML', this._cleanHTML(content));
+        this.editor.set('innerHTML', this._cleanPasteHTML(content));

        // Update the textarea.
        this.updateOriginal();
@ -258,6 +249,86 @@ EditorClean.prototype = {
        Y.soon(Y.bind(this.fallbackPasteCleanup, this));

        return this;
+    },
+
+    /**
+     * Cleanup html that comes from WYSIWYG paste events. These are more likely to contain messy code that we should strip.
+     *
+     * @method _cleanPasteHTML
+     * @private
+     * @param {String} content The html content to clean
+     * @return {String} The cleaned HTML
+     */
+    _cleanPasteHTML: function(content) {
+        // Return an empty string if passed an invalid or empty object.
+        if (!content || content.length === 0) {
+            return "";
+        }
+
+        // Rules that get rid of the real nasties and don't depend on normalized code (correct quotes, whitespace, etc).
+        var rules = [
+            // Remove any xml blocks.
+            {regex: /<xml[^>]*>[\s\S]*?<\/xml>/gi, replace: ""},
+            // Remove any <?xml><\?xml> blocks.
+            {regex: /<\?xml[^>]*>[\s\S]*?<\\\?xml>/gi, replace: ""},
+            // Remove <o:blah>, </o:blah>.
+            {regex: /<\/?\w+:[^>]*>/gi, replace: ""},
+
+            // Source: "http://www.tim-jarrett.com/labs_javascript_scrub_word.php"
+            // Replace extended chars with simple text.
+            {regex: new RegExp(String.fromCharCode(8220), 'gi'), replace: '"'},
+            {regex: new RegExp(String.fromCharCode(8216), 'gi'), replace: "'"},
+            {regex: new RegExp(String.fromCharCode(8217), 'gi'), replace: "'"},
+            {regex: new RegExp(String.fromCharCode(8211), 'gi'), replace: '-'},
+            {regex: new RegExp(String.fromCharCode(8212), 'gi'), replace: '--'},
+            {regex: new RegExp(String.fromCharCode(189), 'gi'), replace: '1/2'},
+            {regex: new RegExp(String.fromCharCode(188), 'gi'), replace: '1/4'},
+            {regex: new RegExp(String.fromCharCode(190), 'gi'), replace: '3/4'},
+            {regex: new RegExp(String.fromCharCode(169), 'gi'), replace: '(c)'},
+            {regex: new RegExp(String.fromCharCode(174), 'gi'), replace: '(r)'},
+            {regex: new RegExp(String.fromCharCode(8230), 'gi'), replace: '...'}
+        ];
+
+        // Apply the first set of harsher rules.
+        content = this._filterContentWithRules(content, rules);
+
+        // Apply the standard rules, which mainly cleans things like headers, links, and style blocks.
+        content = this._cleanHTML(content);
+
+        // Check if the string is empty or only contains whitespace.
+        if (content.length === 0 || !content.match(/\S/)) {
+            return content;
+        }
+
+        // Now we let the browser normalize the code by loading it into the DOM and then get the html back.
+        // This gives us well quoted, well formatted code to continue our work on. Word may provide very poorly formatted code.
+        var holder = document.createElement('div');
+        holder.innerHTML = content;
+        content = holder.innerHTML;
+        // Free up the DOM memory.
+        holder.innerHTML = "";
+
+        // Run some more rules that care about quotes and whitespace.
+        rules = [
+            // Remove MSO-blah, MSO:blah in style attributes. Only removes one or more that appear in succession.
+            {regex: /(<[^>]*?style\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[-:][^>;"]*;?)+/gi, replace: "$1"},
+            // Remove MSO classes in class attributes. Only removes one or more that appear in succession.
+            {regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*MSO[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
+            // Remove Apple- classes in class attributes. Only removes one or more that appear in succession.
+            {regex: /(<[^>]*?class\s*?=\s*?"[^>"]*?)(?:[\s]*Apple-[_a-zA-Z0-9\-]*)+/gi, replace: "$1"},
+            // Remove OLE_LINK# anchors that may litter the code.
+            {regex: /<a [^>]*?name\s*?=\s*?"OLE_LINK\d*?"[^>]*?>\s*?<\/a>/gi, replace: ""},
+            // Remove empty spans.
+            {regex: /<span[^>]*>(&nbsp;|\s)*<\/span>/gi, replace: ""}
+        ];
+
+        // Apply the rules.
+        content = this._filterContentWithRules(content, rules);
+
+        // Reapply the standard cleaner to the content.
+        content = this._cleanHTML(content);
+
+        return content;
    }
 };