001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.util;
029
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
044
045/**
046 * Base utility class for OpenCms <code>{@link org.htmlparser.visitors.NodeVisitor}</code>
047 * implementations, which provides some often used utility functions.
048 * <p>
049 *
050 * This base implementation is only a "pass through" class, that is the content is parsed, but the
051 * generated result is exactly identical to the input.
052 * <p>
053 *
054 * @since 6.2.0
055 */
056public class CmsHtmlParser extends NodeVisitor implements I_CmsHtmlNodeVisitor {
057
058    /** List of upper case tag name strings of tags that should not be auto-corrected if closing divs are missing. */
059    protected List<String> m_noAutoCloseTags;
060
061    /** The array of supported tag names. */
062    // important: don't change the order of these tags in the source, subclasses may expect the tags
063    // at the exact indices give here
064    // if you want to add tags, add them at the end
065    protected static final String[] TAG_ARRAY = new String[] {
066        "H1",
067        "H2",
068        "H3",
069        "H4",
070        "H5",
071        "H6",
072        "P",
073        "DIV",
074        "SPAN",
075        "BR",
076        "OL",
077        "UL",
078        "LI",
079        "TABLE",
080        "TD",
081        "TR",
082        "TH",
083        "THEAD",
084        "TBODY",
085        "TFOOT"};
086
087    /** The list of supported tag names. */
088    protected static final List<String> TAG_LIST = Arrays.asList(TAG_ARRAY);
089
090    /** Indicates if "echo" mode is on, that is all content is written to the result by default. */
091    protected boolean m_echo;
092
093    /** The buffer to write the out to. */
094    protected StringBuffer m_result;
095
096    /** The providable configuration - never null by contract of interface. */
097    private String m_configuration = "";
098
099    /**
100     * Creates a new instance of the html converter with echo mode set to <code>false</code>.
101     * <p>
102     */
103    public CmsHtmlParser() {
104
105        this(false);
106    }
107
108    /**
109     * Creates a new instance of the html converter.
110     * <p>
111     *
112     * @param echo indicates if "echo" mode is on, that is all content is written to the result
113     */
114    public CmsHtmlParser(boolean echo) {
115
116        m_result = new StringBuffer(1024);
117        m_echo = echo;
118        m_noAutoCloseTags = new ArrayList<String>(32);
119    }
120
121    /**
122     * Internally degrades Composite tags that do have children in the DOM tree
123     * to simple single tags. This allows to avoid auto correction of unclosed HTML tags.<p>
124     *
125     * @return A node factory that will not autocorrect open tags specified via <code>{@link #setNoAutoCloseTags(List)}</code>
126     */
127    protected PrototypicalNodeFactory configureNoAutoCorrectionTags() {
128
129        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
130
131        String tagName;
132        Iterator<String> it = m_noAutoCloseTags.iterator();
133        CmsNoAutoCloseTag noAutoCloseTag;
134        while (it.hasNext()) {
135            tagName = it.next();
136            noAutoCloseTag = new CmsNoAutoCloseTag(new String[] {tagName});
137            // TODO: This might break in case registering / unregistering  will change from name based to tag-type based approach:
138            factory.unregisterTag(noAutoCloseTag);
139            factory.registerTag(noAutoCloseTag);
140        }
141        return factory;
142    }
143
144    /**
145     * @see org.opencms.util.I_CmsHtmlNodeVisitor#getConfiguration()
146     */
147    public String getConfiguration() {
148
149        return m_configuration;
150    }
151
152    /**
153     * @see org.opencms.util.I_CmsHtmlNodeVisitor#getResult()
154     */
155    public String getResult() {
156
157        return m_result.toString();
158    }
159
160    /**
161     * Returns the HTML for the given tag itself (not the tag content).
162     * <p>
163     *
164     * @param tag the tag to create the HTML for
165     *
166     * @return the HTML for the given tag
167     */
168    public String getTagHtml(Tag tag) {
169
170        StringBuffer result = new StringBuffer(32);
171        result.append('<');
172        result.append(tag.getText());
173        result.append('>');
174        return result.toString();
175    }
176
177    /**
178     * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String)
179     */
180    public String process(String html, String encoding) throws ParserException {
181
182        m_result = new StringBuffer();
183        Parser parser = new Parser();
184        Lexer lexer = new Lexer();
185
186        // initialize the page with the given char set
187        Page page = new Page(html, encoding);
188        lexer.setPage(page);
189        parser.setLexer(lexer);
190
191        if ((m_noAutoCloseTags != null) && (m_noAutoCloseTags.size() > 0)) {
192            // Degrade Composite tags that do have children in the DOM tree
193            // to simple single tags: This allows to finish this tag with opened HTML tags without the effect
194            // that html parser will generate the closing tags.
195            PrototypicalNodeFactory factory = configureNoAutoCorrectionTags();
196            lexer.setNodeFactory(factory);
197        }
198
199        // process the page using the given visitor
200        parser.visitAllNodesWith(this);
201        // return the result
202        return getResult();
203    }
204
205    /**
206     *
207     * @see org.opencms.util.I_CmsHtmlNodeVisitor#setConfiguration(java.lang.String)
208     */
209    public void setConfiguration(String configuration) {
210
211        if (CmsStringUtil.isNotEmpty(configuration)) {
212            m_configuration = configuration;
213        }
214
215    }
216
217    /**
218     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitEndTag(org.htmlparser.Tag)
219     */
220    @Override
221    public void visitEndTag(Tag tag) {
222
223        if (m_echo) {
224            m_result.append(getTagHtml(tag));
225        }
226    }
227
228    /**
229     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitRemarkNode(org.htmlparser.Remark)
230     */
231    @Override
232    public void visitRemarkNode(Remark remark) {
233
234        if (m_echo) {
235            m_result.append(remark.toHtml(true));
236        }
237    }
238
239    /**
240     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitStringNode(org.htmlparser.Text)
241     */
242    @Override
243    public void visitStringNode(Text text) {
244
245        if (m_echo) {
246            m_result.append(text.getText());
247        }
248    }
249
250    /**
251     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitTag(org.htmlparser.Tag)
252     */
253    @Override
254    public void visitTag(Tag tag) {
255
256        if (m_echo) {
257            m_result.append(getTagHtml(tag));
258        }
259    }
260
261    /**
262     * Collapse HTML whitespace in the given String.<p>
263     *
264     * @param string the string to collapse
265     *
266     * @return the input String with all HTML whitespace collapsed
267     */
268    protected String collapse(String string) {
269
270        int len = string.length();
271        StringBuffer result = new StringBuffer(len);
272        int state = 0;
273        for (int i = 0; i < len; i++) {
274            char c = string.charAt(i);
275            switch (c) {
276                // see HTML specification section 9.1 White space
277                // http://www.w3.org/TR/html4/struct/text.html#h-9.1
278                case '\u0020':
279                case '\u0009':
280                case '\u000C':
281                case '\u200B':
282                case '\r':
283                case '\n':
284                    if (0 != state) {
285                        state = 1;
286                    }
287                    break;
288                default:
289                    if (1 == state) {
290                        result.append(' ');
291                    }
292                    state = 2;
293                    result.append(c);
294            }
295        }
296        return result.toString();
297    }
298
299    /**
300     * Returns a list of upper case tag names for which parsing / visiting will not correct missing closing tags.<p>
301     *
302     * @return a List of upper case tag names for which parsing / visiting will not correct missing closing tags
303     */
304    public List<String> getNoAutoCloseTags() {
305
306        return m_noAutoCloseTags;
307    }
308
309    /**
310     * Sets a list of upper case tag names for which parsing / visiting should not correct missing closing tags.<p>
311     *
312     * @param noAutoCloseTagList a list of upper case tag names for which parsing / visiting
313     *      should not correct missing closing tags to set.
314     */
315    public void setNoAutoCloseTags(List<String> noAutoCloseTagList) {
316
317        // ensuring upper case
318        m_noAutoCloseTags.clear();
319        if (noAutoCloseTagList != null) {
320            Iterator<String> it = noAutoCloseTagList.iterator();
321            while (it.hasNext()) {
322                m_noAutoCloseTags.add((it.next()).toUpperCase());
323            }
324        }
325    }
326}