001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.util;
029
030import org.opencms.main.CmsLog;
031
032import java.io.ByteArrayInputStream;
033import java.io.ByteArrayOutputStream;
034import java.io.UnsupportedEncodingException;
035import java.util.Arrays;
036import java.util.Collections;
037import java.util.List;
038import java.util.Properties;
039import java.util.regex.Pattern;
040
041import org.apache.commons.logging.Log;
042
043import org.w3c.tidy.Tidy;
044
045/**
046 * HTML cleaner and pretty printer using JTidy.<p>
047 *
048 * Used to clean up HTML code (e.g. remove word tags) and optionally create XHTML from HTML.<p>
049 *
050 * @since 6.0.0
051 */
052public class CmsHtmlConverterJTidy extends A_CmsHtmlConverter {
053
054    /** The log object for this class. */
055    private static final Log LOG = CmsLog.getLog(CmsHtmlConverterJTidy.class);
056
057    /** List of default modes if none were specified explicitly. */
058    private static final List<String> MODES_DEFAULT = Collections.unmodifiableList(
059        Arrays.asList(new String[] {CmsHtmlConverter.PARAM_ENABLED}));
060
061    /** Regular expression for cleanup. */
062    String[] m_cleanupPatterns = {
063        "<o:p>.*(\\r\\n)*.*</o:p>",
064        "<o:p>.*(\\r\\n)*.*</O:p>",
065        "<\\?xml:.*(\\r\\n).*/>",
066        "<\\?xml:.*(\\r\\n).*(\\r\\n).*/\\?>",
067        "<\\?xml:.*(\\r\\n).*(\\r\\n).*/>",
068        "<\\?xml:(.*(\\r\\n)).*/\\?>",
069        "<o:SmartTagType.*(\\r\\n)*.*/>",
070        "<o:smarttagtype.*(\\r\\n)*.*/>"};
071
072    /** Patterns for cleanup. */
073    Pattern[] m_clearStyle;
074
075    /** Regular expressions for paragraph replacements -- additionally remove leading and trailing breaks. */
076    String[] m_replaceParagraphPatterns = {
077        "</ul>\n<br />",
078        "</ol>\n<br />",
079        "<p><br />",
080        "<p>",
081        "<br />(\\s)*&nbsp;(\\s)*</p>",
082        "<br /></p>",
083        "</p>",
084        "^<br />",
085        "<br />$"};
086
087    /** Values for paragraph replacements. */
088    String[] m_replaceParagraphValues = {"</ul>", "</ol>", "<br />", "<br />", "<br />", "<br />", "<br />", "", ""};
089
090    /** Regular expression for replace. */
091    String[] m_replacePatterns = {
092        "&#160;",
093        "(\\r\\n){2,}",
094        "\u2013",
095        "(\\n){2,}",
096        "\\(\\r\\n<",
097        "\\(\\n<",
098        "\\(\\r\\n(\\ ){1,}<",
099        "\\(\\n(\\ ){1,}<",
100        "\\r\\n<span",
101        "\\n<span"};
102
103    /** Patterns for replace. */
104    Pattern[] m_replaceStyle;
105
106    /** Values for replace. */
107    String[] m_replaceValues = {"&nbsp;", "", "&ndash;", "", "(<", "(<", "(<", "(<", "<span", "<span"};
108
109    /** The tidy to use. */
110    Tidy m_tidy;
111
112    /** The length of the line separator. */
113    private int m_lineSeparatorLength;
114
115    /** Indicates if this converter is enabled or not. */
116    private boolean m_modeEnabled;
117
118    /** Indicates if paragraph replacement mode is enabled or not. */
119    private boolean m_modeReplaceParagraphs;
120
121    /** Indicates if word cleanup mode is enabled or not. */
122    private boolean m_modeWord;
123
124    /** Indicates if XHTML conversion mode is enabled or not. */
125    private boolean m_modeXhtml;
126
127    /**
128     * Constructor, creates a new CmsHtmlConverterJTidy.<p>
129     */
130    public CmsHtmlConverterJTidy() {
131
132        super(null, MODES_DEFAULT);
133    }
134
135    /**
136     * Constructor, creates a new CmsHtmlConverterJTidy.<p>
137     *
138     * Possible values for the conversion mode are:<ul>
139     * <li>{@link CmsHtmlConverter#PARAM_DISABLED}: The conversion is disabled.
140     * <li>{@link CmsHtmlConverter#PARAM_ENABLED}: Conversion is enabled without transformation, so HTML is pretty printed only.
141     * <li>{@link CmsHtmlConverter#PARAM_XHTML}: Conversion from HTML to XHTML is enabled.
142     * <li>{@link CmsHtmlConverter#PARAM_WORD}: Cleanup of word like HTML tags is enabled.
143     * <li>{@link CmsHtmlConverter#PARAM_REPLACE_PARAGRAPHS}: Cleanup of paragraphs and leading/trailing line breaks is enabled.
144     *
145     * </ul>
146     *
147     * @param encoding the encoding used for the HTML code conversion
148     * @param modes the conversion modes to use
149     */
150    public CmsHtmlConverterJTidy(String encoding, List<String> modes) {
151
152        super(encoding, modes);
153    }
154
155    /**
156     * Converts the given HTML code according to the settings of this converter.<p>
157     *
158     * @param htmlInput HTML input stored in a string
159     * @return string containing the converted HTML
160     *
161     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
162     */
163    @Override
164    public String convertToString(String htmlInput) throws UnsupportedEncodingException {
165
166        // initialize the modes
167        initModes();
168        // only do parsing if the mode is not set to disabled
169        if (m_modeEnabled) {
170
171            // do a maximum of 10 loops
172            int max = m_modeWord ? 10 : 1;
173            int count = 0;
174
175            // we may have to do several parsing runs until all tags are removed
176            int oldSize = htmlInput.length();
177            String workHtml = regExp(htmlInput);
178            while (count < max) {
179                count++;
180
181                // first add the optional header if in word mode
182                if (m_modeWord) {
183                    workHtml = adjustHtml(workHtml);
184                }
185                // now use tidy to parse and format the HTML
186                workHtml = parse(workHtml);
187                if (m_modeWord) {
188                    // cut off the line separator, which is always appended
189                    workHtml = workHtml.substring(0, workHtml.length() - m_lineSeparatorLength);
190                }
191
192                if (workHtml.length() == oldSize) {
193                    // no change in HTML code after last processing loop
194                    workHtml = regExp(workHtml);
195                    break;
196                }
197                oldSize = workHtml.length();
198                workHtml = regExp(workHtml);
199            }
200            if (LOG.isDebugEnabled()) {
201                LOG.debug(
202                    Messages.get().getBundle().key(
203                        Messages.LOG_PARSING_RUNS_2,
204                        this.getClass().getName(),
205                        Integer.valueOf(count)));
206            }
207            htmlInput = workHtml;
208        }
209
210        return htmlInput;
211    }
212
213    /**
214     * Adjusts the HTML input code in WORD mode if necessary.<p>
215     *
216     * When in WORD mode, the HTML tag must contain the xmlns:o="urn:schemas-microsoft-com:office:office"
217     * attribute, otherwise tide will not remove the WORD tags from the document.
218     *
219     * @param htmlInput the HTML input
220     * @return adjusted HTML input
221     */
222    private String adjustHtml(String htmlInput) {
223
224        // check if we have some opening and closing HTML tags
225        if ((htmlInput.toLowerCase().indexOf("<html>") == -1) && (htmlInput.toLowerCase().indexOf("</html>") == -1)) {
226            // add a correct HTML tag for word generated HTML
227            StringBuffer tmp = new StringBuffer();
228            tmp.append("<html xmlns:o=\"\"><body>");
229            tmp.append(htmlInput);
230            tmp.append("</body></html>");
231            htmlInput = tmp.toString();
232        }
233        return htmlInput;
234    }
235
236    /**
237     * Initializes the JTidy modes.<p>
238     */
239    private void initModes() {
240
241        // set all internal modes to disabled
242        m_modeEnabled = false;
243        m_modeReplaceParagraphs = false;
244        m_modeWord = false;
245        m_modeXhtml = false;
246
247        // extract all operation modes
248        List<String> modes = getModes();
249
250        // configure the tidy depending on the operation mode
251        if (modes.contains(CmsHtmlConverter.PARAM_ENABLED)) {
252            m_modeEnabled = true;
253        }
254        if (modes.contains(CmsHtmlConverter.PARAM_XHTML)) {
255            m_modeEnabled = true;
256            m_modeXhtml = true;
257        }
258        if (modes.contains(CmsHtmlConverter.PARAM_WORD)) {
259            m_modeEnabled = true;
260            m_modeWord = true;
261        }
262        if (modes.contains(CmsHtmlConverter.PARAM_REPLACE_PARAGRAPHS)) {
263            m_modeEnabled = true;
264            m_modeReplaceParagraphs = true;
265        }
266
267        // get line separator length
268        m_lineSeparatorLength = System.getProperty("line.separator").length();
269
270        // we need this only if the conversion is enabled
271        if (m_modeEnabled) {
272
273            // create the main tidy object
274            m_tidy = new Tidy();
275
276            // set specified word, XHTML conversion settings
277            m_tidy.setXHTML(m_modeXhtml);
278            m_tidy.setWord2000(m_modeWord);
279
280            // add additional tags
281            // those are required to handle word 2002 (and newer) documents
282            Properties additionalTags = new Properties();
283            additionalTags.put("new-empty-tags", "o:smarttagtype");
284            additionalTags.put("new-inline-tags", "o:smarttagtype");
285            m_tidy.getConfiguration().addProps(additionalTags);
286
287            // set the default tidy configuration
288
289            // set the tidy encoding
290            m_tidy.setInputEncoding(getEncoding());
291            m_tidy.setOutputEncoding(getEncoding());
292
293            // disable the tidy meta element in output
294            m_tidy.setTidyMark(false);
295            // disable clean mode
296            m_tidy.setMakeClean(false);
297            // enable numeric entities
298            m_tidy.setNumEntities(true);
299            // create output of the body only
300            m_tidy.setPrintBodyOnly(true);
301            // disable URI fixing, because it breaks domain names with special characters (IDNs) in links when used in HTML fields
302            m_tidy.setFixUri(false);
303            // force output creation even if there are tidy errors
304            m_tidy.setForceOutput(true);
305            // set tidy to quiet mode to prevent output
306            m_tidy.setQuiet(true);
307            // disable warning output
308            m_tidy.setShowWarnings(false);
309            // allow comments in the output
310            m_tidy.setHideComments(false);
311            // set no line break before a <br>
312            m_tidy.setBreakBeforeBR(false);
313            // don't wrap attribute values
314            m_tidy.setWrapAttVals(false);
315            // warp lines after 100 chars
316            m_tidy.setWraplen(100);
317            // no indentation
318            m_tidy.setSpaces(0);
319
320            if (m_modeWord) {
321                // create the regular expression for cleanup, only used in word clean mode
322                m_clearStyle = new Pattern[m_cleanupPatterns.length];
323                for (int i = 0; i < m_cleanupPatterns.length; i++) {
324                    m_clearStyle[i] = Pattern.compile(m_cleanupPatterns[i]);
325                }
326            }
327
328            // add paragraph replacement regular expression and values if needed
329            if (m_modeReplaceParagraphs) {
330                // add the regular expression and values for paragraph replacements
331                String[] newPatterns = new String[m_replacePatterns.length + m_replaceParagraphPatterns.length];
332                String[] newValues = new String[m_replacePatterns.length + m_replaceParagraphPatterns.length];
333                System.arraycopy(m_replacePatterns, 0, newPatterns, 0, m_replacePatterns.length);
334                System.arraycopy(
335                    m_replaceParagraphPatterns,
336                    0,
337                    newPatterns,
338                    m_replacePatterns.length,
339                    m_replaceParagraphPatterns.length);
340                System.arraycopy(m_replaceValues, 0, newValues, 0, m_replacePatterns.length);
341                System.arraycopy(
342                    m_replaceParagraphValues,
343                    0,
344                    newValues,
345                    m_replacePatterns.length,
346                    m_replaceParagraphPatterns.length);
347                m_replacePatterns = newPatterns;
348                m_replaceValues = newValues;
349            }
350
351            // create the regular expression for replace
352            m_replaceStyle = new Pattern[m_replacePatterns.length];
353            for (int i = 0; i < m_replacePatterns.length; i++) {
354                m_replaceStyle[i] = Pattern.compile(m_replacePatterns[i]);
355            }
356        }
357    }
358
359    /**
360     * Parses a byte array containing HTML code with different parsing modes.<p>
361     *
362     * @param htmlInput a byte array containing raw HTML code
363     *
364     * @return parsed and cleared HTML code
365     *
366     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
367     */
368    private String parse(String htmlInput) throws UnsupportedEncodingException {
369
370        // prepare the streams
371        ByteArrayInputStream in = new ByteArrayInputStream(htmlInput.getBytes(getEncoding()));
372        ByteArrayOutputStream out = new ByteArrayOutputStream();
373        // do the parsing
374        m_tidy.parse(in, out);
375        // return the result
376        byte[] result = out.toByteArray();
377        return new String(result, getEncoding());
378    }
379
380    /**
381     * Parses the htmlInput with regular expressions for cleanup purposes.<p>
382     *
383     * @param htmlInput the HTML input
384     *
385     * @return the processed HTML
386     */
387    private String regExp(String htmlInput) {
388
389        String parsedHtml = htmlInput.trim();
390
391        if (m_modeWord) {
392            // process all cleanup regular expressions
393            for (int i = 0; i < m_cleanupPatterns.length; i++) {
394                parsedHtml = m_clearStyle[i].matcher(parsedHtml).replaceAll("");
395            }
396        }
397
398        // process all replace regular expressions
399        for (int i = 0; i < m_replacePatterns.length; i++) {
400            parsedHtml = m_replaceStyle[i].matcher(parsedHtml).replaceAll(m_replaceValues[i]);
401        }
402
403        return parsedHtml;
404    }
405}