/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.util;

import org.opencms.main.CmsLog;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;

import org.w3c.tidy.Tidy;

/**
 * HTML cleaner and pretty printer using JTidy.<p>
 *
 * Used to clean up HTML code (e.g. remove word tags) and optionally create XHTML from HTML.<p>
 *
 * The conversion modes are re-read and the JTidy instance is re-created on every call to
 * {@link #convertToString(String)}. The base pattern arrays of this class are never modified
 * by a conversion, so repeated conversions with the same instance apply the same set of
 * replacements each time.<p>
 *
 * @since 6.0.0
 */
public class CmsHtmlConverterJTidy extends A_CmsHtmlConverter {

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsHtmlConverterJTidy.class);

    /** List of default modes if none were specified explicitly. */
    private static final List<String> MODES_DEFAULT = Collections.unmodifiableList(
        Arrays.asList(CmsHtmlConverter.PARAM_ENABLED));

    /** Regular expressions for cleanup, matches are removed from the HTML in word mode. */
    String[] m_cleanupPatterns = {
        "<o:p>.*(\\r\\n)*.*</o:p>",
        "<o:p>.*(\\r\\n)*.*</O:p>",
        "<\\?xml:.*(\\r\\n).*/>",
        "<\\?xml:.*(\\r\\n).*(\\r\\n).*/\\?>",
        "<\\?xml:.*(\\r\\n).*(\\r\\n).*/>",
        "<\\?xml:(.*(\\r\\n)).*/\\?>",
        "<o:SmartTagType.*(\\r\\n)*.*/>",
        "<o:smarttagtype.*(\\r\\n)*.*/>"};

    /** Compiled patterns for cleanup, only (re)built when word mode is active. */
    Pattern[] m_clearStyle;

    /**
     * Regular expressions for paragraph replacements -- additionally remove leading and trailing breaks.<p>
     *
     * NOTE(review): the fifth entry contains a plain space between the two whitespace groups;
     * this may originally have been a non-breaking-space entity -- confirm against the upstream source.
     */
    String[] m_replaceParagraphPatterns = {
        "</ul>\n<br />",
        "</ol>\n<br />",
        "<p><br />",
        "<p>",
        "<br />(\\s)* (\\s)*</p>",
        "<br /></p>",
        "</p>",
        "^<br />",
        "<br />$"};

    /** Values for paragraph replacements, index-aligned with {@link #m_replaceParagraphPatterns}. */
    String[] m_replaceParagraphValues = {"</ul>", "</ol>", "<br />", "<br />", "<br />", "<br />", "<br />", "", ""};

    /**
     * Regular expressions for replace, always applied when the converter is enabled.<p>
     *
     * NOTE(review): as written, the first entry (a space) and the third entry (an en dash) are
     * replaced by themselves, see {@link #m_replaceValues}. This looks like HTML entities got lost
     * somewhere -- confirm against the upstream source before relying on these two replacements.
     */
    String[] m_replacePatterns = {
        " ",
        "(\\r\\n){2,}",
        "\u2013",
        "(\\n){2,}",
        "\\(\\r\\n<",
        "\\(\\n<",
        "\\(\\r\\n(\\ ){1,}<",
        "\\(\\n(\\ ){1,}<",
        "\\r\\n<span",
        "\\n<span"};

    /** Compiled patterns for replace, rebuilt on every mode initialization. */
    Pattern[] m_replaceStyle;

    /** Values for replace, index-aligned with {@link #m_replacePatterns}. */
    String[] m_replaceValues = {" ", "", "–", "", "(<", "(<", "(<", "(<", "<span", "<span"};

    /** The tidy to use. */
    Tidy m_tidy;

    /** Replacement values that are index-aligned with the compiled patterns in {@link #m_replaceStyle}. */
    private String[] m_activeReplaceValues;

    /** The length of the line separator. */
    private int m_lineSeparatorLength;

    /** Indicates if this converter is enabled or not. */
    private boolean m_modeEnabled;

    /** Indicates if paragraph replacement mode is enabled or not. */
    private boolean m_modeReplaceParagraphs;

    /** Indicates if word cleanup mode is enabled or not. */
    private boolean m_modeWord;

    /** Indicates if XHTML conversion mode is enabled or not. */
    private boolean m_modeXhtml;

    /**
     * Constructor, creates a new CmsHtmlConverterJTidy.<p>
     *
     * Passes a <code>null</code> encoding and the default mode list (enabled only) to the super class.<p>
     */
    public CmsHtmlConverterJTidy() {

        super(null, MODES_DEFAULT);
    }

    /**
     * Constructor, creates a new CmsHtmlConverterJTidy.<p>
     *
     * Possible values for the conversion mode are:<ul>
     * <li>{@link CmsHtmlConverter#PARAM_DISABLED}: The conversion is disabled.</li>
     * <li>{@link CmsHtmlConverter#PARAM_ENABLED}: Conversion is enabled without transformation, so HTML is pretty printed only.</li>
     * <li>{@link CmsHtmlConverter#PARAM_XHTML}: Conversion from HTML to XHTML is enabled.</li>
     * <li>{@link CmsHtmlConverter#PARAM_WORD}: Cleanup of word like HTML tags is enabled.</li>
     * <li>{@link CmsHtmlConverter#PARAM_REPLACE_PARAGRAPHS}: Cleanup of paragraphs and leading/trailing line breaks is enabled.</li>
     * </ul>
     *
     * @param encoding the encoding used for the HTML code conversion
     * @param modes the conversion modes to use
     */
    public CmsHtmlConverterJTidy(String encoding, List<String> modes) {

        super(encoding, modes);
    }

    /**
     * Converts the given HTML code according to the settings of this converter.<p>
     *
     * If none of the conversion modes is active, or if the input is <code>null</code>,
     * the input is returned unchanged.<p>
     *
     * @param htmlInput HTML input stored in a string
     * @return string containing the converted HTML
     *
     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
     */
    @Override
    public String convertToString(String htmlInput) throws UnsupportedEncodingException {

        // initialize the modes
        initModes();
        // only do parsing if the mode is not set to disabled and there is something to parse
        if (!m_modeEnabled || (htmlInput == null)) {
            return htmlInput;
        }

        // word mode may need several runs until all tags are removed, limit this to 10 loops
        int max = m_modeWord ? 10 : 1;
        int count = 0;

        int oldSize = htmlInput.length();
        String workHtml = regExp(htmlInput);
        while (count < max) {
            count++;

            // first add the optional header if in word mode
            if (m_modeWord) {
                workHtml = adjustHtml(workHtml);
            }
            // now use tidy to parse and format the HTML
            workHtml = parse(workHtml);
            if (m_modeWord) {
                // cut off the line separator expected at the end of the tidy output,
                // the end index is clamped so that very short (e.g. empty) output can not cause an exception
                int end = Math.max(0, workHtml.length() - m_lineSeparatorLength);
                workHtml = workHtml.substring(0, end);
            }

            // the length of the parsed code is compared to the previous run to detect if anything changed
            boolean unchanged = workHtml.length() == oldSize;
            oldSize = workHtml.length();
            workHtml = regExp(workHtml);
            if (unchanged) {
                // no change in HTML code after last processing loop
                break;
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(
                Messages.get().getBundle().key(
                    Messages.LOG_PARSING_RUNS_2,
                    this.getClass().getName(),
                    Integer.valueOf(count)));
        }
        return workHtml;
    }

    /**
     * Joins two String arrays into a new array, the given arrays are not modified.<p>
     *
     * @param first the elements to place first
     * @param second the elements to place after the first ones
     *
     * @return a new array containing all elements of both arrays
     */
    private static String[] concat(String[] first, String[] second) {

        String[] result = new String[first.length + second.length];
        System.arraycopy(first, 0, result, 0, first.length);
        System.arraycopy(second, 0, result, first.length, second.length);
        return result;
    }

    /**
     * Adjusts the HTML input code in WORD mode if necessary.<p>
     *
     * If the input contains neither an opening <code>&lt;html&gt;</code> nor a closing
     * <code>&lt;/html&gt;</code> tag (checked case insensitive), it is wrapped in a html / body
     * element pair where the html element carries an empty <code>xmlns:o</code> attribute.
     * According to the original documentation of this method, tidy will not remove
     * the WORD tags from the document without such an attribute.<p>
     *
     * @param htmlInput the HTML input
     * @return adjusted HTML input
     */
    private String adjustHtml(String htmlInput) {

        String lowerCaseHtml = htmlInput.toLowerCase();
        // check if we have some opening or closing HTML tags
        if (lowerCaseHtml.contains("<html>") || lowerCaseHtml.contains("</html>")) {
            return htmlInput;
        }
        // add a HTML tag with the office name space attribute for word generated HTML
        return "<html xmlns:o=\"\"><body>" + htmlInput + "</body></html>";
    }

    /**
     * Compiles all regular expressions required for the currently active modes.<p>
     *
     * The paragraph replacements are appended to the default replacements in local arrays only,
     * so the base arrays keep their size no matter how often the modes are initialized.<p>
     */
    private void compilePatterns() {

        if (m_modeWord) {
            // create the regular expression for cleanup, only used in word clean mode
            m_clearStyle = new Pattern[m_cleanupPatterns.length];
            for (int i = 0; i < m_cleanupPatterns.length; i++) {
                m_clearStyle[i] = Pattern.compile(m_cleanupPatterns[i]);
            }
        }

        String[] patterns = m_replacePatterns;
        String[] values = m_replaceValues;
        if (m_modeReplaceParagraphs) {
            // add the regular expression and values for paragraph replacements
            patterns = concat(m_replacePatterns, m_replaceParagraphPatterns);
            values = concat(m_replaceValues, m_replaceParagraphValues);
        }

        // create the regular expression for replace
        m_replaceStyle = new Pattern[patterns.length];
        for (int i = 0; i < patterns.length; i++) {
            m_replaceStyle[i] = Pattern.compile(patterns[i]);
        }
        m_activeReplaceValues = values;
    }

    /**
     * Creates and configures the JTidy instance for the currently active modes.<p>
     */
    private void configureTidy() {

        // create the main tidy object
        m_tidy = new Tidy();

        // set specified word, XHTML conversion settings
        m_tidy.setXHTML(m_modeXhtml);
        m_tidy.setWord2000(m_modeWord);

        // add additional tags
        // those are required to handle word 2002 (and newer) documents
        Properties additionalTags = new Properties();
        additionalTags.put("new-empty-tags", "o:smarttagtype");
        additionalTags.put("new-inline-tags", "o:smarttagtype");
        m_tidy.getConfiguration().addProps(additionalTags);

        // set the tidy encoding
        m_tidy.setInputEncoding(getEncoding());
        m_tidy.setOutputEncoding(getEncoding());

        // disable the tidy meta element in output
        m_tidy.setTidyMark(false);
        // disable clean mode
        m_tidy.setMakeClean(false);
        // enable numeric entities
        m_tidy.setNumEntities(true);
        // create output of the body only
        m_tidy.setPrintBodyOnly(true);
        // disable URI fixing, because it breaks domain names with special characters (IDNs) in links when used in HTML fields
        m_tidy.setFixUri(false);
        // force output creation even if there are tidy errors
        m_tidy.setForceOutput(true);
        // set tidy to quiet mode to prevent output
        m_tidy.setQuiet(true);
        // disable warning output
        m_tidy.setShowWarnings(false);
        // allow comments in the output
        m_tidy.setHideComments(false);
        // set no line break before a <br>
        m_tidy.setBreakBeforeBR(false);
        // don't wrap attribute values
        m_tidy.setWrapAttVals(false);
        // wrap lines after 100 chars
        m_tidy.setWraplen(100);
        // no indentation
        m_tidy.setSpaces(0);
    }

    /**
     * Initializes the JTidy modes.<p>
     *
     * Reads the configured modes, updates the internal mode flags and, if at least one
     * mode is active, creates the tidy instance and compiles the regular expressions.<p>
     */
    private void initModes() {

        // extract all operation modes
        List<String> modes = getModes();

        // every mode except "enabled" itself implies that the converter is enabled
        m_modeXhtml = modes.contains(CmsHtmlConverter.PARAM_XHTML);
        m_modeWord = modes.contains(CmsHtmlConverter.PARAM_WORD);
        m_modeReplaceParagraphs = modes.contains(CmsHtmlConverter.PARAM_REPLACE_PARAGRAPHS);
        m_modeEnabled = modes.contains(CmsHtmlConverter.PARAM_ENABLED)
            || m_modeXhtml
            || m_modeWord
            || m_modeReplaceParagraphs;

        // get line separator length
        m_lineSeparatorLength = System.getProperty("line.separator").length();

        // we need this only if the conversion is enabled
        if (m_modeEnabled) {
            configureTidy();
            compilePatterns();
        }
    }

    /**
     * Parses a String containing HTML code with the configured tidy instance.<p>
     *
     * The input is converted to bytes and the result back to a String using the encoding of this converter.<p>
     *
     * @param htmlInput a String containing raw HTML code
     *
     * @return parsed and cleared HTML code
     *
     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
     */
    private String parse(String htmlInput) throws UnsupportedEncodingException {

        String encoding = getEncoding();
        // prepare the streams
        ByteArrayInputStream in = new ByteArrayInputStream(htmlInput.getBytes(encoding));
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        // do the parsing
        m_tidy.parse(in, out);
        // return the result
        return new String(out.toByteArray(), encoding);
    }

    /**
     * Processes the htmlInput with regular expressions for cleanup purposes.<p>
     *
     * The input is trimmed first. In word mode all matches of the cleanup expressions are removed,
     * afterwards the replace expressions of the active modes are applied in order.<p>
     *
     * @param htmlInput the HTML input
     *
     * @return the processed HTML
     */
    private String regExp(String htmlInput) {

        String parsedHtml = htmlInput.trim();

        if (m_modeWord) {
            // process all cleanup regular expressions
            for (Pattern cleanup : m_clearStyle) {
                parsedHtml = cleanup.matcher(parsedHtml).replaceAll("");
            }
        }

        // process all replace regular expressions
        for (int i = 0; i < m_replaceStyle.length; i++) {
            parsedHtml = m_replaceStyle[i].matcher(parsedHtml).replaceAll(m_activeReplaceValues[i]);
        }

        return parsedHtml;
    }
}