001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.util;
029
030import org.opencms.file.CmsObject;
031import org.opencms.file.CmsProperty;
032import org.opencms.file.CmsPropertyDefinition;
033import org.opencms.file.CmsResource;
034import org.opencms.i18n.CmsEncoder;
035import org.opencms.main.CmsException;
036import org.opencms.main.CmsLog;
037import org.opencms.main.OpenCms;
038
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
046
047import org.apache.commons.logging.Log;
048
049/**
050 * HTML cleaner and pretty printer.<p>
051 *
052 * Used to clean up HTML code (e.g. remove word tags) and optionally create XHTML from HTML.<p>
053 *
054 * @since 6.0.0
055 */
056public class CmsHtmlConverter {
057
    /**
     * Parameter value for disabled mode.<p>
     *
     * {@link #isConversionEnabled(String)} returns <code>false</code> for every mode String that contains this value.
     */
    public static final String PARAM_DISABLED = CmsStringUtil.FALSE;

    /** Parameter value for enabled mode, this is the mode used by the default constructor. */
    public static final String PARAM_ENABLED = CmsStringUtil.TRUE;

    /** Parameter value for replace paragraph mode. */
    public static final String PARAM_REPLACE_PARAGRAPHS = "replace-paragraphs";

    /** Parameter value for WORD mode (cleanup of word like HTML tags). */
    public static final String PARAM_WORD = "cleanup";

    /** Parameter value for XHTML mode (conversion from HTML to XHTML). */
    public static final String PARAM_XHTML = "xhtml";

    /** The separator used to combine several modes in the configured modes String. */
    public static final char SEPARATOR_MODES = ';';

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsHtmlConverter.class);

    /** The encoding used for the HTML code conversion, assigned in {@link #init(String, String)}. */
    private String m_encoding;

    /** The conversion mode String for the converter, may hold several modes joined by {@link #SEPARATOR_MODES}. */
    private String m_mode;
084
085    /**
086     * Constructor, creates a new CmsHtmlConverter.<p>
087     *
088     * The encoding used by default is {@link CmsEncoder#ENCODING_UTF_8}.<p>
089     */
090    public CmsHtmlConverter() {
091
092        init(CmsEncoder.ENCODING_UTF_8, PARAM_ENABLED);
093    }
094
    /**
     * Constructor, creates a new CmsHtmlConverter.<p>
     *
     * Possible values for the default conversion mode are:<ul>
     * <li>{@link #PARAM_DISABLED}: The conversion is disabled.</li>
     * <li>{@link #PARAM_ENABLED}: Conversion is enabled without transformation, so HTML is pretty printed only.</li>
     * <li>{@link #PARAM_XHTML}: Conversion from HTML to XHTML is enabled.</li>
     * <li>{@link #PARAM_WORD}: Cleanup of word like HTML tags is enabled.</li>
     * <li>Other values can be used by the implementing converter class.</li>
     * </ul>
     * Values can be combined with the <code>;</code> separator, so it is e.g. possible to convert
     * to XHTML and clean from word at the same time.<p>
     *
     * A <code>null</code> encoding is replaced by {@link CmsEncoder#ENCODING_UTF_8}, and an empty or
     * whitespace only mode is stored as the empty String.<p>
     *
     * @param encoding the encoding used for the HTML code conversion
     * @param mode the conversion mode to use
     */
    public CmsHtmlConverter(String encoding, String mode) {

        // all argument normalization is done in the shared init method
        init(encoding, mode);
    }
115
116    /**
117     * Reads the content conversion property of a given resource and returns its value.<p>
118     *
119     * A default value (disabled) is returned if the property could not be read.<p>
120     *
121     * @param cms the CmsObject
122     * @param resource the resource in the VFS
123     * @return the content conversion property value
124     */
125    public static String getConversionSettings(CmsObject cms, CmsResource resource) {
126
127        // read the content-conversion property
128        String contentConversion;
129        try {
130            String resourceName = cms.getSitePath(resource);
131            CmsProperty contentConversionProperty = cms.readPropertyObject(
132                resourceName,
133                CmsPropertyDefinition.PROPERTY_CONTENT_CONVERSION,
134                true);
135            contentConversion = contentConversionProperty.getValue(CmsHtmlConverter.PARAM_DISABLED);
136        } catch (CmsException e) {
137            // if there was an error reading the property, choose a default value
138            contentConversion = CmsHtmlConverter.PARAM_DISABLED;
139        }
140        return contentConversion;
141    }
142
143    /**
144     * Tests if the content conversion is enabled.<p>
145     *
146     * @param conversionMode the content conversion mode string
147     * @return true or false
148     */
149    public static boolean isConversionEnabled(String conversionMode) {
150
151        boolean value = true;
152        if ((conversionMode == null) || (conversionMode.indexOf(PARAM_DISABLED) != -1)) {
153            value = false;
154        }
155        return value;
156    }
157
158    /**
159     * Converts the given HTML code according to the settings of this converter.<p>
160     *
161     * @param htmlInput HTML input stored in an array of bytes
162     * @return array of bytes containing the converted HTML
163     *
164     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
165     */
166    public byte[] convertToByte(byte[] htmlInput) throws UnsupportedEncodingException {
167
168        return convertToByte(new String(htmlInput, getEncoding()));
169    }
170
171    /**
172     * Converts the given HTML code according to the settings of this converter.<p>
173     *
174     * @param htmlInput HTML input stored in a string
175     * @return array of bytes containing the converted HTML
176     *
177     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
178     */
179    public byte[] convertToByte(String htmlInput) throws UnsupportedEncodingException {
180
181        return convertToString(htmlInput).getBytes(getEncoding());
182    }
183
184    /**
185     * Converts the given HTML code according to the settings of this converter.<p>
186     *
187     * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
188     *
189     * @param htmlInput HTML input stored in an array of bytes
190     * @return array of bytes containing the converted HTML
191     */
192    public byte[] convertToByteSilent(byte[] htmlInput) {
193
194        try {
195            return convertToByte(htmlInput);
196        } catch (Exception e) {
197            if (LOG.isWarnEnabled()) {
198                LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
199            }
200            return htmlInput;
201        }
202    }
203
204    /**
205     * Converts the given HTML code according to the settings of this converter.<p>
206     *
207     * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
208     *
209     * @param htmlInput HTML input stored in a string
210     * @return array of bytes containing the converted HTML
211     */
212    public byte[] convertToByteSilent(String htmlInput) {
213
214        try {
215            return convertToByte(htmlInput.getBytes(getEncoding()));
216        } catch (Exception e) {
217            if (LOG.isWarnEnabled()) {
218                LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
219            }
220            try {
221                return htmlInput.getBytes(getEncoding());
222            } catch (UnsupportedEncodingException e1) {
223                if (LOG.isWarnEnabled()) {
224                    LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1);
225                }
226                return htmlInput.getBytes();
227            }
228        }
229    }
230
231    /**
232     * Converts the given HTML code according to the settings of this converter.<p>
233     *
234     * @param htmlInput HTML input stored in an array of bytes
235     * @return string containing the converted HTML
236     *
237     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
238     */
239    public String convertToString(byte[] htmlInput) throws UnsupportedEncodingException {
240
241        return convertToString(new String(htmlInput, getEncoding()));
242    }
243
244    /**
245     * Converts the given HTML code according to the settings of the converter.<p>
246     *
247     * @param htmlInput HTML input stored in a string
248     * @return string containing the converted HTML
249     *
250     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
251     */
252    public String convertToString(String htmlInput) throws UnsupportedEncodingException {
253
254        // first: collect all converter classes to use on the input
255        Map<String, List<String>> converters = new HashMap<String, List<String>>();
256        for (Iterator<String> i = getModes().iterator(); i.hasNext();) {
257            String mode = i.next();
258            String converterClass = OpenCms.getResourceManager().getHtmlConverter(mode);
259            List<String> modes = new ArrayList<String>();
260            if (converters.containsKey(converterClass)) {
261                // converter class already defined for a previous mode, get mode list
262                modes = converters.get(converterClass);
263            }
264            // add mode name to list for the converter
265            modes.add(mode);
266            // store converter with modes in map
267            converters.put(converterClass, modes);
268        }
269
270        // second: convert the content with all found converter classes
271        for (Iterator<Entry<String, List<String>>> i = converters.entrySet().iterator(); i.hasNext();) {
272            Entry<String, List<String>> entry = i.next();
273            String className = entry.getKey();
274            List<String> modes = entry.getValue();
275            try {
276                I_CmsHtmlConverter converter = (I_CmsHtmlConverter)Class.forName(className).newInstance();
277                // initialize converter
278                converter.init(getEncoding(), modes);
279                // convert input String
280                htmlInput = converter.convertToString(htmlInput);
281            } catch (ClassNotFoundException e) {
282                LOG.error(
283                    org.opencms.loader.Messages.get().getBundle().key(
284                        org.opencms.loader.Messages.LOG_HTML_CONVERTER_CLASS_NOT_FOUND_1,
285                        className),
286                    e);
287            } catch (IllegalAccessException e) {
288                LOG.error(
289                    org.opencms.loader.Messages.get().getBundle().key(
290                        org.opencms.loader.Messages.LOG_HTML_CONVERTER_CLASS_NOT_FOUND_1,
291                        className),
292                    e);
293            } catch (InstantiationException e) {
294                LOG.error(
295                    org.opencms.loader.Messages.get().getBundle().key(
296                        org.opencms.loader.Messages.LOG_HTML_CONVERTER_CLASS_NOT_FOUND_1,
297                        className),
298                    e);
299            }
300        }
301        return htmlInput;
302    }
303
304    /**
305     * Converts the given HTML code according to the settings of this converter.<p>
306     *
307     * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
308     *
309     * @param htmlInput HTML input stored in an array of bytes
310     *
311     * @return string containing the converted HTML
312     */
313    public String convertToStringSilent(byte[] htmlInput) {
314
315        try {
316            return convertToString(htmlInput);
317        } catch (Exception e) {
318            if (LOG.isWarnEnabled()) {
319                LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
320            }
321            try {
322                return new String(htmlInput, getEncoding());
323            } catch (UnsupportedEncodingException e1) {
324                if (LOG.isWarnEnabled()) {
325                    LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1);
326                }
327                return new String(htmlInput);
328            }
329        }
330    }
331
332    /**
333     * Converts the given HTML code according to the settings of this converter.<p>
334     *
335     * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
336     *
337     * @param htmlInput HTML input stored in string
338     *
339     * @return string containing the converted HTML
340     */
341    public String convertToStringSilent(String htmlInput) {
342
343        try {
344            return convertToString(htmlInput);
345        } catch (Exception e) {
346            if (LOG.isWarnEnabled()) {
347                LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
348            }
349            return htmlInput;
350        }
351    }
352
    /**
     * Returns the encoding used for the HTML code conversion.<p>
     *
     * This is the encoding assigned in {@link #init(String, String)}, which uses
     * {@link CmsEncoder#ENCODING_UTF_8} if no encoding was provided.<p>
     *
     * @return the encoding used for the HTML code conversion
     */
    public String getEncoding() {

        return m_encoding;
    }
362
    /**
     * Returns the conversion mode to use.<p>
     *
     * The value is the complete mode String, which may contain several modes separated
     * by {@link #SEPARATOR_MODES}. It is the empty String if no mode was provided.<p>
     *
     * @return the conversion mode to use
     */
    public String getMode() {

        return m_mode;
    }
372
373    /**
374     * Returns the conversion modes to use as List of String parameters.<p>
375     *
376     * @return the conversion modes to use as List of String parameters
377     */
378    private List<String> getModes() {
379
380        List<String> modes = new ArrayList<String>();
381        try {
382            modes = CmsStringUtil.splitAsList(getMode(), SEPARATOR_MODES, true);
383        } catch (Exception e) {
384            // error generating list, an empty list will be returned
385        }
386
387        return modes;
388    }
389
390    /**
391     * Initializes the HTML converter instance.<p>
392     *
393     * Possible values for the conversion mode are dependent from the converter implementation.<p>
394     *
395     * Values can be combined with the <code>;</code> separator, so that it is e.g. possible to convert
396     * to XHTML and clean from word at the same time.<p>
397     *
398     * @param encoding the encoding used for the HTML code conversion
399     * @param mode the conversion mode to use
400     */
401    private void init(String encoding, String mode) {
402
403        if (encoding == null) {
404            m_encoding = CmsEncoder.ENCODING_UTF_8;
405        } else {
406            m_encoding = encoding;
407        }
408        if (CmsStringUtil.isEmptyOrWhitespaceOnly(mode)) {
409            m_mode = "";
410        } else {
411            m_mode = mode;
412        }
413    }
414
415}