001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.util;
029
030import org.opencms.i18n.CmsMessageContainer;
031
032import java.util.ArrayList;
033import java.util.Collections;
034import java.util.HashSet;
035import java.util.List;
036import java.util.Set;
037import java.util.Stack;
038
039import org.htmlparser.Parser;
040import org.htmlparser.PrototypicalNodeFactory;
041import org.htmlparser.Tag;
042import org.htmlparser.lexer.Lexer;
043import org.htmlparser.lexer.Page;
044import org.htmlparser.util.ParserException;
045import org.htmlparser.visitors.NodeVisitor;
046
047/**
048 * Validates HTML.<p>
049 */
050public class CmsHtmlValidator extends NodeVisitor {
051
052    /** Void HTML elements that do not need to be closed. */
053    private static final Set<String> AUTOCLOSE_TAGS = new HashSet<String>();
054    /** Tags to override the HTMLParser composite tags, to avoid automatic closing of unbalanced tags. */
055    private static final String[] NO_AUTOCLOSE_TAGS = new String[] {
056        "APPLET",
057        "BLOCKQUOTE",
058        "BODY",
059        "LI",
060        "UL",
061        "OL",
062        "DL",
063        "DD",
064        "DT",
065        "DIV",
066        "FORM",
067        "FRAMESET",
068        "HTML",
069        "H1",
070        "H2",
071        "H3",
072        "H4",
073        "H5",
074        "H6",
075        "HEAD",
076        "LABEL",
077        "A",
078        "OBJECT",
079        "OPTION",
080        "P",
081        "SCRIPT",
082        "NOSCRIPT",
083        "SELECT",
084        "SPAN",
085        "STYLE",
086        "TD",
087        "TR",
088        "TBODY",
089        "TFOOT",
090        "THEAD",
091        "TEXTAREA",
092        "TITLE"};
093
094    static {
095        Collections.addAll(
096            AUTOCLOSE_TAGS,
097            "AREA",
098            "BASE",
099            "BR",
100            "COL",
101            "EMBED",
102            "HR",
103            "IMG",
104            "INPUT",
105            "KEYGEN",
106            "LINK",
107            "MENUITEM",
108            "META",
109            "PARAM",
110            "SOURCE",
111            "TRACK",
112            "WBR");
113    }
114
115    /** The error messages. */
116    private List<CmsMessageContainer> m_messages = new ArrayList<CmsMessageContainer>();
117
118    /** The number of root elements. */
119    private int m_rootElementCount;
120
121    /** The stack of opened HTML tags. */
122    private Stack<String> m_stack = new Stack<String>();
123
124    /** The number of unbalanced closed tags. */
125    private int m_unbalancedClosedTags;
126
127    /** The number of unbalanced opened tags. */
128    private int m_unbalancedOpenedTags;
129
130    /**
131     * Returns the validation error messages.<p>
132     *
133     * @return the error messages
134     */
135    public List<CmsMessageContainer> getMessages() {
136
137        return m_messages;
138    }
139
140    /**
141     * Returns the number of root elements.<p>
142     *
143     * @return the number of root elements
144     */
145    public int getRootElementCount() {
146
147        return m_rootElementCount;
148    }
149
150    /**
151     * Returns whether the validated HTML is balanced.<p>
152     *
153     * @return <code>true</code> in case the validated HTML is balanced
154     */
155    public boolean isBalanced() {
156
157        System.out.println(
158            "Unbalanced opened "
159                + m_unbalancedOpenedTags
160                + " tags, unbalanced closed "
161                + m_unbalancedClosedTags
162                + " tags.");
163        return (m_unbalancedOpenedTags == 0) && (m_unbalancedClosedTags == 0);
164    }
165
166    /**
167     * Validates the given HTML string.<p>
168     *
169     * @param html the HTML to validate
170     *
171     * @throws ParserException in case parsing fails
172     */
173    public void validate(String html) throws ParserException {
174
175        m_unbalancedClosedTags = 0;
176        m_unbalancedOpenedTags = 0;
177        m_rootElementCount = 0;
178        m_stack.clear();
179        m_messages.clear();
180        Parser parser = new Parser();
181        Lexer lexer = new Lexer();
182        Page page = new Page(html, "UTF-8");
183        lexer.setPage(page);
184        parser.setLexer(lexer);
185        // override built in composite tags to skip automatic tag closing
186        PrototypicalNodeFactory factory = configureNoAutoCorrectionTags();
187        lexer.setNodeFactory(factory);
188
189        parser.visitAllNodesWith(this);
190    }
191
192    /**
193     * @see org.htmlparser.visitors.NodeVisitor#visitEndTag(org.htmlparser.Tag)
194     */
195    @Override
196    public void visitEndTag(Tag tag) {
197
198        String tagName = tag.getTagName();
199        if (tagName.equals(m_stack.peek())) {
200            m_stack.pop();
201        } else {
202            if (m_stack.contains(tagName)) {
203                while (!tagName.equals(m_stack.peek())) {
204                    String enclosedTag = m_stack.pop();
205                    if (AUTOCLOSE_TAGS.contains(enclosedTag)) {
206                        System.out.println("Unbalanced void tag " + enclosedTag + ", will be ignored.");
207                    } else {
208                        System.out.println("Unbalanced opening tag: " + enclosedTag);
209                        m_messages.add(Messages.get().container(Messages.ERR_UNBALANCED_OPENING_TAG_1, enclosedTag));
210                        m_unbalancedOpenedTags++;
211                    }
212                }
213                m_stack.pop();
214            } else {
215                System.out.println("Unbalanced closing tag: " + tagName);
216                m_messages.add(Messages.get().container(Messages.ERR_UNBALANCED_CLOSING_TAG_1, tagName));
217                m_unbalancedClosedTags++;
218            }
219        }
220    }
221
222    /**
223     * @see org.htmlparser.visitors.NodeVisitor#visitTag(org.htmlparser.Tag)
224     */
225    @Override
226    public void visitTag(Tag tag) {
227
228        if (m_stack.isEmpty()) {
229            m_rootElementCount++;
230        }
231
232        if (!tag.isEmptyXmlTag()) {
233            m_stack.push(tag.getTagName());
234        }
235    }
236
237    /**
238     * Internally degrades Composite tags that do have children in the DOM tree
239     * to simple single tags. This allows to avoid auto correction of unclosed HTML tags.<p>
240     *
241     * @return A node factory that will not auto correct open tags
242     */
243    protected PrototypicalNodeFactory configureNoAutoCorrectionTags() {
244
245        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
246
247        CmsNoAutoCloseTag noAutoCloseTag = new CmsNoAutoCloseTag(NO_AUTOCLOSE_TAGS);
248        factory.unregisterTag(noAutoCloseTag);
249        factory.registerTag(noAutoCloseTag);
250        return factory;
251    }
252}