001/* 002 * This library is part of OpenCms - 003 * the Open Source Content Management System 004 * 005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com) 006 * 007 * This library is free software; you can redistribute it and/or 008 * modify it under the terms of the GNU Lesser General Public 009 * License as published by the Free Software Foundation; either 010 * version 2.1 of the License, or (at your option) any later version. 011 * 012 * This library is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * Lesser General Public License for more details. 016 * 017 * For further information about Alkacon Software GmbH & Co. KG, please see the 018 * company website: http://www.alkacon.com 019 * 020 * For further information about OpenCms, please see the 021 * project website: http://www.opencms.org 022 * 023 * You should have received a copy of the GNU Lesser General Public 024 * License along with this library; if not, write to the Free Software 025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 026 */ 027 028package org.opencms.util; 029 030import org.opencms.file.CmsObject; 031import org.opencms.file.CmsProperty; 032import org.opencms.file.CmsPropertyDefinition; 033import org.opencms.file.CmsResource; 034import org.opencms.i18n.CmsEncoder; 035import org.opencms.main.CmsException; 036import org.opencms.main.CmsLog; 037import org.opencms.main.OpenCms; 038 039import java.io.UnsupportedEncodingException; 040import java.util.ArrayList; 041import java.util.HashMap; 042import java.util.Iterator; 043import java.util.List; 044import java.util.Map; 045import java.util.Map.Entry; 046 047import org.apache.commons.logging.Log; 048 049/** 050 * HTML cleaner and pretty printer.<p> 051 * 052 * Used to clean up HTML code (e.g. remove word tags) and optionally create XHTML from HTML.<p> 053 * 054 * @since 6.0.0 055 */ 056public class CmsHtmlConverter { 057 058 /** Parameter value for disabled mode. **/ 059 public static final String PARAM_DISABLED = CmsStringUtil.FALSE; 060 061 /** Parameter value for enabled mode. **/ 062 public static final String PARAM_ENABLED = CmsStringUtil.TRUE; 063 064 /** Parameter value for replace paragraph mode. */ 065 public static final String PARAM_REPLACE_PARAGRAPHS = "replace-paragraphs"; 066 067 /** Parameter value for WORD mode. **/ 068 public static final String PARAM_WORD = "cleanup"; 069 070 /** Parameter value for XHTML mode. **/ 071 public static final String PARAM_XHTML = "xhtml"; 072 073 /** The separator used for the configured modes String. */ 074 public static final char SEPARATOR_MODES = ';'; 075 076 /** The log object for this class. */ 077 private static final Log LOG = CmsLog.getLog(CmsHtmlConverter.class); 078 079 /** The encoding used for the HTML code conversion. */ 080 private String m_encoding; 081 082 /** The conversion mode for the converter. */ 083 private String m_mode; 084 085 /** 086 * Constructor, creates a new CmsHtmlConverter.<p> 087 * 088 * The encoding used by default is {@link CmsEncoder#ENCODING_UTF_8}.<p> 089 */ 090 public CmsHtmlConverter() { 091 092 init(CmsEncoder.ENCODING_UTF_8, PARAM_ENABLED); 093 } 094 095 /** 096 * Constructor, creates a new CmsHtmlConverter.<p> 097 * 098 * Possible values for the default conversion mode are:<ul> 099 * <li>{@link #PARAM_DISABLED}: The conversion is disabled.</li> 100 * <li>{@link #PARAM_ENABLED}: Conversion is enabled without transformation, so HTML is pretty printed only.</li> 101 * <li>{@link #PARAM_XHTML}: Conversion from HTML to XHTML is enabled.</li> 102 * <li>{@link #PARAM_WORD}: Cleanup of word like HTML tags is enabled.</li> 103 * <li>Other values can be used by the implementing converter class.</li> 104 * </ul> 105 * Values can be combined with the <code>;</code> separator, so it is e.g. possible to convert 106 * to XHTML and clean from word at the same time.<p> 107 * 108 * @param encoding the encoding used for the HTML code conversion 109 * @param mode the conversion mode to use 110 */ 111 public CmsHtmlConverter(String encoding, String mode) { 112 113 init(encoding, mode); 114 } 115 116 /** 117 * Reads the content conversion property of a given resource and returns its value.<p> 118 * 119 * A default value (disabled) is returned if the property could not be read.<p> 120 * 121 * @param cms the CmsObject 122 * @param resource the resource in the VFS 123 * @return the content conversion property value 124 */ 125 public static String getConversionSettings(CmsObject cms, CmsResource resource) { 126 127 // read the content-conversion property 128 String contentConversion; 129 try { 130 String resourceName = cms.getSitePath(resource); 131 CmsProperty contentConversionProperty = cms.readPropertyObject( 132 resourceName, 133 CmsPropertyDefinition.PROPERTY_CONTENT_CONVERSION, 134 true); 135 contentConversion = contentConversionProperty.getValue(CmsHtmlConverter.PARAM_DISABLED); 136 } catch (CmsException e) { 137 // if there was an error reading the property, choose a default value 138 contentConversion = CmsHtmlConverter.PARAM_DISABLED; 139 } 140 return contentConversion; 141 } 142 143 /** 144 * Tests if the content conversion is enabled.<p> 145 * 146 * @param conversionMode the content conversion mode string 147 * @return true or false 148 */ 149 public static boolean isConversionEnabled(String conversionMode) { 150 151 boolean value = true; 152 if ((conversionMode == null) || (conversionMode.indexOf(PARAM_DISABLED) != -1)) { 153 value = false; 154 } 155 return value; 156 } 157 158 /** 159 * Converts the given HTML code according to the settings of this converter.<p> 160 * 161 * @param htmlInput HTML input stored in an array of bytes 162 * @return array of bytes containing the converted HTML 163 * 164 * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported 165 */ 166 public byte[] convertToByte(byte[] htmlInput) throws UnsupportedEncodingException { 167 168 return convertToByte(new String(htmlInput, getEncoding())); 169 } 170 171 /** 172 * Converts the given HTML code according to the settings of this converter.<p> 173 * 174 * @param htmlInput HTML input stored in a string 175 * @return array of bytes containing the converted HTML 176 * 177 * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported 178 */ 179 public byte[] convertToByte(String htmlInput) throws UnsupportedEncodingException { 180 181 return convertToString(htmlInput).getBytes(getEncoding()); 182 } 183 184 /** 185 * Converts the given HTML code according to the settings of this converter.<p> 186 * 187 * If an any error occurs during the conversion process, the original input is returned unmodified.<p> 188 * 189 * @param htmlInput HTML input stored in an array of bytes 190 * @return array of bytes containing the converted HTML 191 */ 192 public byte[] convertToByteSilent(byte[] htmlInput) { 193 194 try { 195 return convertToByte(htmlInput); 196 } catch (Exception e) { 197 if (LOG.isWarnEnabled()) { 198 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e); 199 } 200 return htmlInput; 201 } 202 } 203 204 /** 205 * Converts the given HTML code according to the settings of this converter.<p> 206 * 207 * If an any error occurs during the conversion process, the original input is returned unmodified.<p> 208 * 209 * @param htmlInput HTML input stored in a string 210 * @return array of bytes containing the converted HTML 211 */ 212 public byte[] convertToByteSilent(String htmlInput) { 213 214 try { 215 return convertToByte(htmlInput.getBytes(getEncoding())); 216 } catch (Exception e) { 217 if (LOG.isWarnEnabled()) { 218 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e); 219 } 220 try { 221 return htmlInput.getBytes(getEncoding()); 222 } catch (UnsupportedEncodingException e1) { 223 if (LOG.isWarnEnabled()) { 224 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1); 225 } 226 return htmlInput.getBytes(); 227 } 228 } 229 } 230 231 /** 232 * Converts the given HTML code according to the settings of this converter.<p> 233 * 234 * @param htmlInput HTML input stored in an array of bytes 235 * @return string containing the converted HTML 236 * 237 * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported 238 */ 239 public String convertToString(byte[] htmlInput) throws UnsupportedEncodingException { 240 241 return convertToString(new String(htmlInput, getEncoding())); 242 } 243 244 /** 245 * Converts the given HTML code according to the settings of the converter.<p> 246 * 247 * @param htmlInput HTML input stored in a string 248 * @return string containing the converted HTML 249 * 250 * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported 251 */ 252 public String convertToString(String htmlInput) throws UnsupportedEncodingException { 253 254 // first: collect all converter classes to use on the input 255 Map<String, List<String>> converters = new HashMap<String, List<String>>(); 256 for (Iterator<String> i = getModes().iterator(); i.hasNext();) { 257 String mode = i.next(); 258 String converterClass = OpenCms.getResourceManager().getHtmlConverter(mode); 259 List<String> modes = new ArrayList<String>(); 260 if (converters.containsKey(converterClass)) { 261 // converter class already defined for a previous mode, get mode list 262 modes = converters.get(converterClass); 263 } 264 // add mode name to list for the converter 265 modes.add(mode); 266 // store converter with modes in map 267 converters.put(converterClass, modes); 268 } 269 270 // second: convert the content with all found converter classes 271 for (Iterator<Entry<String, List<String>>> i = converters.entrySet().iterator(); i.hasNext();) { 272 Entry<String, List<String>> entry = i.next(); 273 String className = entry.getKey(); 274 List<String> modes = entry.getValue(); 275 try { 276 I_CmsHtmlConverter converter = (I_CmsHtmlConverter)Class.forName(className).newInstance(); 277 // initialize converter 278 converter.init(getEncoding(), modes); 279 // convert input String 280 htmlInput = converter.convertToString(htmlInput); 281 } catch (ClassNotFoundException e) { 282 LOG.error( 283 org.opencms.loader.Messages.get().getBundle().key( 284 org.opencms.loader.Messages.LOG_HTML_CONVERTER_CLASS_NOT_FOUND_1, 285 className), 286 e); 287 } catch (IllegalAccessException e) { 288 LOG.error( 289 org.opencms.loader.Messages.get().getBundle().key( 290 org.opencms.loader.Messages.LOG_HTML_CONVERTER_CLASS_NOT_FOUND_1, 291 className), 292 e); 293 } catch (InstantiationException e) { 294 LOG.error( 295 org.opencms.loader.Messages.get().getBundle().key( 296 org.opencms.loader.Messages.LOG_HTML_CONVERTER_CLASS_NOT_FOUND_1, 297 className), 298 e); 299 } 300 } 301 return htmlInput; 302 } 303 304 /** 305 * Converts the given HTML code according to the settings of this converter.<p> 306 * 307 * If an any error occurs during the conversion process, the original input is returned unmodified.<p> 308 * 309 * @param htmlInput HTML input stored in an array of bytes 310 * 311 * @return string containing the converted HTML 312 */ 313 public String convertToStringSilent(byte[] htmlInput) { 314 315 try { 316 return convertToString(htmlInput); 317 } catch (Exception e) { 318 if (LOG.isWarnEnabled()) { 319 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e); 320 } 321 try { 322 return new String(htmlInput, getEncoding()); 323 } catch (UnsupportedEncodingException e1) { 324 if (LOG.isWarnEnabled()) { 325 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1); 326 } 327 return new String(htmlInput); 328 } 329 } 330 } 331 332 /** 333 * Converts the given HTML code according to the settings of this converter.<p> 334 * 335 * If an any error occurs during the conversion process, the original input is returned unmodified.<p> 336 * 337 * @param htmlInput HTML input stored in string 338 * 339 * @return string containing the converted HTML 340 */ 341 public String convertToStringSilent(String htmlInput) { 342 343 try { 344 return convertToString(htmlInput); 345 } catch (Exception e) { 346 if (LOG.isWarnEnabled()) { 347 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e); 348 } 349 return htmlInput; 350 } 351 } 352 353 /** 354 * Returns the encoding used for the HTML code conversion.<p> 355 * 356 * @return the encoding used for the HTML code conversion 357 */ 358 public String getEncoding() { 359 360 return m_encoding; 361 } 362 363 /** 364 * Returns the conversion mode to use.<p> 365 * 366 * @return the conversion mode to use 367 */ 368 public String getMode() { 369 370 return m_mode; 371 } 372 373 /** 374 * Returns the conversion modes to use as List of String parameters.<p> 375 * 376 * @return the conversion modes to use as List of String parameters 377 */ 378 private List<String> getModes() { 379 380 List<String> modes = new ArrayList<String>(); 381 try { 382 modes = CmsStringUtil.splitAsList(getMode(), SEPARATOR_MODES, true); 383 } catch (Exception e) { 384 // error generating list, an empty list will be returned 385 } 386 387 return modes; 388 } 389 390 /** 391 * Initializes the HTML converter instance.<p> 392 * 393 * Possible values for the conversion mode are dependent from the converter implementation.<p> 394 * 395 * Values can be combined with the <code>;</code> separator, so that it is e.g. possible to convert 396 * to XHTML and clean from word at the same time.<p> 397 * 398 * @param encoding the encoding used for the HTML code conversion 399 * @param mode the conversion mode to use 400 */ 401 private void init(String encoding, String mode) { 402 403 if (encoding == null) { 404 m_encoding = CmsEncoder.ENCODING_UTF_8; 405 } else { 406 m_encoding = encoding; 407 } 408 if (CmsStringUtil.isEmptyOrWhitespaceOnly(mode)) { 409 m_mode = ""; 410 } else { 411 m_mode = mode; 412 } 413 } 414 415}