filter MS Word codes within HTML fields

Public

Function to remove MS Word markup from existing HTML fields. I use this when migrating dirty Drupal databases.

Get raw version
php
  /**
   * Removes MS Word / MS Office generated markup from an HTML fragment.
   *
   * Steps, in order: smart-punctuation entities are replaced with ASCII, all
   * remaining HTML entities are decoded, C-style comments are removed, tags
   * not listed in $allowed_tags are stripped, runs of whitespace are
   * collapsed, bold/italic/underline tags are reduced to attribute-free
   * <b>/<i>/<u>, leftover HTML comments and Word "Mso*" class attributes are
   * removed, and a known mis-encoded apostrophe sequence is repaired.
   *
   * Only PCRE and plain string functions are used; the mbstring regex
   * encoding is not touched. A regex step that fails (for example because the
   * input is not valid UTF-8) is skipped instead of discarding the text.
   *
   * @param string $text         HTML fragment to clean.
   * @param string $allowed_tags Tags to keep, in the format accepted by strip_tags().
   *
   * @return string The cleaned fragment.
   */
  function strip_word_html($text, $allowed_tags = '<p><quote><b><i><sup><sub><em><strong><u><br>')
  {
      // Replace MS "smart" punctuation entities with plain ASCII first, so the
      // generic entity decoding below does not turn them into Unicode quotes.
      // These are literal strings, so no regex (and no UTF-8 validation) is needed.
      $text = str_replace(
          array('&lsquo;', '&rsquo;', '&ldquo;', '&rdquo;', '&mdash;'),
          array('\'', '\'', '"', '"', '-'),
          $text
      );

      // Decode every remaining entity: in some MS headers some entities are
      // encoded and some are not.
      $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

      // Strip C-style comments first; embedded in HTML comments (a combination
      // MS Word produces) they can keep strip_tags() from removing the HTML comment.
      $text = _strip_word_html_replace('#/\*.*?\*/#s', '', $text);

      // Put a space into expressions such as '<1' ('< 1') so they look less
      // like the start of a tag (somewhat application specific).
      // NOTE(review): whether strip_tags() honours this when allowed tags are
      // given should be confirmed against the PHP version in use.
      $text = _strip_word_html_replace('/<([0-9]+)/', '< $1', $text);

      $text = strip_tags($text, $allowed_tags);

      // Drop whitespace runs at the very start/end of the text and collapse any
      // other run of two or more whitespace characters to a single space. The
      // /u pattern also sees Unicode whitespace but fails on invalid UTF-8, in
      // which case the byte-wise variant is used instead.
      $collapsed = preg_replace(
          array('/^\s\s+/', '/\s\s+$/', '/\s\s+/u'),
          array('', '', ' '),
          $text
      );
      if ($collapsed === null) {
          $collapsed = _strip_word_html_replace(
              array('/^\s\s+/', '/\s\s+$/', '/\s\s+/'),
              array('', '', ' '),
              $text
          );
      }
      $text = $collapsed;

      // Strip inline CSS/attributes and normalise emphasis tags. The \b after
      // the tag name keeps '<b' from matching '<br>', '<i' from matching
      // '<img>', and '<u' from matching '<ul>'.
      $search = array(
          '#<(strong|b)\b[^>]*>(.*?)</(strong|b)>#is',
          '#<(em|i)\b[^>]*>(.*?)</(em|i)>#is',
          '#<u\b[^>]*>(.*?)</u>#is',
      );
      $replace = array('<b>$2</b>', '<i>$2</i>', '<u>$1</u>');
      $text = _strip_word_html_replace($search, $replace, $text);

      // Newer MS Word exports contain conditionals such as 'if gte mso 9' whose
      // content can keep strip_tags() from removing the enclosing HTML comment
      // (which holds MS style definitions); remove any leftover comments here.
      $text = _strip_word_html_replace('/<!--.*?-->/s', '', $text);

      // Remove Word's "Mso*" class attributes (MsoNormal, MsoListParagraph, ...)
      // together with the whitespace in front of them, so '<p class="MsoNormal">'
      // becomes '<p>'.
      $text = _strip_word_html_replace('/\s*\bclass=(["\']?)Mso[A-Za-z0-9]*\1/', '', $text);

      // Repair the right single quote that was decoded with the wrong charset
      // (UTF-8 bytes read as Windows-1252, i.e. a-circumflex, euro sign, trade
      // mark sign). The ASCII transliteration 'aEU(tm)' is kept for backward
      // compatibility.
      $text = str_replace(
          array("\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2", 'aEU(tm)'),
          "'",
          $text
      );

      return $text;
  }

  /**
   * preg_replace() wrapper used by strip_word_html(): returns the subject
   * unchanged when the regex engine reports an error (preg_replace() returns
   * NULL), so one failing step cannot wipe the whole text.
   *
   * @param string|array $pattern     Pattern(s) as accepted by preg_replace().
   * @param string|array $replacement Replacement(s) as accepted by preg_replace().
   * @param string       $text        Subject string.
   *
   * @return string The replaced text, or $text itself on regex failure.
   */
  function _strip_word_html_replace($pattern, $replacement, $text)
  {
      $result = preg_replace($pattern, $replacement, $text);

      return $result === null ? $text : $result;
  }