Send me more information about:

JS (Server) removal of MS Word HTML/XML

Function to strip Microsoft Word HTML formatting

As developers, we've all been there you give someone an HTML-enabled editor yet they STILL paste in content from Word, thereby ruining all your dedicated efforts at styling and making text look consistent and pretty (damn you and your Comic Sans, Bill!).

 

So, bearing this in mind, we put together a wee script which removes all the standard formatting of the mso:xml/html and other formatting which Microsoft Word stuffs in to retain the formatted document. Don't get me wrong, Word is very good at what it was designed for writing word documents not creating web pages!

 

The function explained: basically in the function below, we pass the main body of the text or passage into the js function (note that we are running this on the server to get the database elements parsed prior to passing it to the client) then, in a linear fashion systematically to do a series of "replaces". The nature of the replacing of some lines has experienced difficulties in earlier versions of Internet Explorer hence the addition of some RegEx functions, merely a belt-and-braces approach to ensure that the block of text can be fully processed.

 

Javascript Function

<script type="text/javascript" runat="server" language="javascript">
function CleanWordHTML( str )
{
str = str.replace(/<o:p>\s*<\/o:p>/g, "") ;
str = str.replace(/<o:p>.*?<\/o:p>/g, "&nbsp;") ;
str = str.replace( /\s*mso-[^:]+:[^;"]+;?/gi, "" ) ;
str = str.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, "" ) ;
str = str.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" ) ;
str = str.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, "" ) ;
str = str.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" ) ;
str = str.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" ) ;
str = str.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" ) ;
str = str.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" ) ;
str = str.replace( /\s*tab-stops:[^;"]*;?/gi, "" ) ;
str = str.replace( /\s*tab-stops:[^"]*/gi, "" ) ;
str = str.replace( /\s*face="[^"]*"/gi, "" ) ;
str = str.replace( /\s*face=[^ >]*/gi, "" ) ;
str = str.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, "" ) ;
str = str.replace(/<(\w[^>]*) class=([^ |>]*)([^>]*)/gi, "<$1$3") ;
str = str.replace( /<(\w[^>]*) style="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;
str = str.replace( /\s*style="\s*"/gi, '' ) ;
str = str.replace( /<SPAN\s*[^>]*>\s*&nbsp;\s*<\/SPAN>/gi, '&nbsp;' ) ;
str = str.replace( /<SPAN\s*[^>]*><\/SPAN>/gi, '' ) ;
str = str.replace(/<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3") ;
str = str.replace( /<SPAN\s*>(.*?)<\/SPAN>/gi, '$1' ) ;
str = str.replace( /<FONT\s*>(.*?)<\/FONT>/gi, '$1' ) ;
str = str.replace(/<\\?\?xml[^>]*>/gi, "") ;
str = str.replace(/<\/?\w+:[^>]*>/gi, "") ;
str = str.replace( /<H\d>\s*<\/H\d>/gi, '' ) ;
str = str.replace( /<H1([^>]*)>/gi, '' ) ;
str = str.replace( /<H2([^>]*)>/gi, '' ) ;
str = str.replace( /<H3([^>]*)>/gi, '' ) ;
str = str.replace( /<H4([^>]*)>/gi, '' ) ;
str = str.replace( /<H5([^>]*)>/gi, '' ) ;
str = str.replace( /<H6([^>]*)>/gi, '' ) ;
str = str.replace( /<\/H\d>/gi, '<br>' ) ; //remove this to take out breaks where Heading tags were
str = str.replace( /<(U|I|STRIKE)>&nbsp;<\/\1>/g, '&nbsp;' ) ;
str = str.replace( /<(B|b)>&nbsp;<\/\b|B>/g, '' ) ;
str = str.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;
str = str.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;
str = str.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;
//some RegEx code for the picky browsers
var re = new RegExp("(<P)([^>]*>.*?)(<\/P>)","gi") ;
str = str.replace( re, "<div$2</div>" ) ;
var re2 = new RegExp("(<font|<FONT)([^*>]*>.*?)(<\/FONT>|<\/font>)","gi") ;
str = str.replace( re2, "<div$2</div>") ;
str = str.replace( /size|SIZE = ([\d]{1})/g, '' ) ;

return str ;
}
</script>

 

Deploying the function (VBScript)

 

<%=CleanWordHTML(rs("textfieldfromdb"))%>

 

Mark Lawson November 2007