/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.util;

import org.opencms.i18n.CmsMessageContainer;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;

import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;

/**
 * Validates HTML.<p>
 *
 * The validator visits every tag of the parsed input and keeps a stack of the currently
 * opened tags. Closing tags without a matching opening tag and opening tags that are
 * never closed are counted and reported as error messages.<p>
 *
 * All results (messages, counters) are kept in instance fields and are reset at the
 * start of each call to {@link #validate(String)}.<p>
 */
public class CmsHtmlValidator extends NodeVisitor {

    /** Void HTML elements that do not need to be closed. */
    private static final Set<String> AUTOCLOSE_TAGS = new HashSet<String>();

    /** Tags to override the HTMLParser composite tags, to avoid automatic closing of unbalanced tags. */
    private static final String[] NO_AUTOCLOSE_TAGS = new String[] {
        "APPLET",
        "BLOCKQUOTE",
        "BODY",
        "LI",
        "UL",
        "OL",
        "DL",
        "DD",
        "DT",
        "DIV",
        "FORM",
        "FRAMESET",
        "HTML",
        "H1",
        "H2",
        "H3",
        "H4",
        "H5",
        "H6",
        "HEAD",
        "LABEL",
        "A",
        "OBJECT",
        "OPTION",
        "P",
        "SCRIPT",
        "NOSCRIPT",
        "SELECT",
        "SPAN",
        "STYLE",
        "TD",
        "TR",
        "TBODY",
        "TFOOT",
        "THEAD",
        "TEXTAREA",
        "TITLE"};

    static {
        Collections.addAll(
            AUTOCLOSE_TAGS,
            "AREA",
            "BASE",
            "BR",
            "COL",
            "EMBED",
            "HR",
            "IMG",
            "INPUT",
            "KEYGEN",
            "LINK",
            "MENUITEM",
            "META",
            "PARAM",
            "SOURCE",
            "TRACK",
            "WBR");
    }

    /** The error messages. */
    private final List<CmsMessageContainer> m_messages = new ArrayList<CmsMessageContainer>();

    /** The number of root elements. */
    private int m_rootElementCount;

    /** The stack of opened HTML tags. */
    private final Stack<String> m_stack = new Stack<String>();

    /** The number of unbalanced closed tags. */
    private int m_unbalancedClosedTags;

    /** The number of unbalanced opened tags. */
    private int m_unbalancedOpenedTags;

    /**
     * Checks whether an opened tag with the given name has to be closed by an end tag.<p>
     *
     * Void elements never need an end tag. Names that do not start with a letter are
     * skipped as well, these are expected to be declarations or processing instructions
     * such as <code>!DOCTYPE</code> (assumption about the names reported by the parser,
     * to be confirmed against the HTMLParser lexer).<p>
     *
     * @param tagName the tag name as stored on the stack, may be <code>null</code>
     *
     * @return <code>true</code> if a missing end tag for this tag is an error
     */
    private static boolean requiresEndTag(String tagName) {

        if ((tagName == null) || (tagName.length() == 0)) {
            return false;
        }
        if (!Character.isLetter(tagName.charAt(0))) {
            return false;
        }
        return !AUTOCLOSE_TAGS.contains(tagName);
    }

    /**
     * Returns the validation error messages.<p>
     *
     * The returned list is the validator's own list, it is cleared by the next call
     * to {@link #validate(String)}.<p>
     *
     * @return the error messages
     */
    public List<CmsMessageContainer> getMessages() {

        return m_messages;
    }

    /**
     * Returns the number of root elements.<p>
     *
     * @return the number of root elements
     */
    public int getRootElementCount() {

        return m_rootElementCount;
    }

    /**
     * Returns whether the validated HTML is balanced.<p>
     *
     * @return <code>true</code> in case the validated HTML is balanced
     */
    public boolean isBalanced() {

        return (m_unbalancedOpenedTags == 0) && (m_unbalancedClosedTags == 0);
    }

    /**
     * Validates the given HTML string.<p>
     *
     * Resets all results of a previous run, parses the HTML while visiting every tag
     * with this instance, and finally reports all tags that are still open once the
     * end of the input has been reached.<p>
     *
     * @param html the HTML to validate
     *
     * @throws ParserException in case parsing fails
     */
    public void validate(String html) throws ParserException {

        m_unbalancedClosedTags = 0;
        m_unbalancedOpenedTags = 0;
        m_rootElementCount = 0;
        m_stack.clear();
        m_messages.clear();
        Parser parser = new Parser();
        Lexer lexer = new Lexer();
        Page page = new Page(html, "UTF-8");
        lexer.setPage(page);
        parser.setLexer(lexer);
        // override built in composite tags to skip automatic tag closing
        PrototypicalNodeFactory factory = configureNoAutoCorrectionTags();
        lexer.setNodeFactory(factory);

        parser.visitAllNodesWith(this);

        // everything left on the stack was opened but never closed before the input ended
        while (!m_stack.isEmpty()) {
            reportUnclosedTag(m_stack.pop());
        }
    }

    /**
     * @see org.htmlparser.visitors.NodeVisitor#visitEndTag(org.htmlparser.Tag)
     */
    @Override
    public void visitEndTag(Tag tag) {

        String tagName = tag.getTagName();
        if (!m_stack.contains(tagName)) {
            // no matching opening tag, this includes the case of an empty stack
            // (peeking into an empty stack would throw an EmptyStackException)
            m_messages.add(Messages.get().container(Messages.ERR_UNBALANCED_CLOSING_TAG_1, tagName));
            m_unbalancedClosedTags++;
            return;
        }
        // unwind the stack down to the matching opening tag,
        // every tag removed on the way has been left unclosed
        String openedTag = m_stack.pop();
        while (!tagName.equals(openedTag)) {
            reportUnclosedTag(openedTag);
            openedTag = m_stack.pop();
        }
    }

    /**
     * @see org.htmlparser.visitors.NodeVisitor#visitTag(org.htmlparser.Tag)
     */
    @Override
    public void visitTag(Tag tag) {

        // NOTE(review): a tag that stays on the stack (e.g. a void tag like BR at root level)
        // keeps the stack non empty, so following siblings are not counted as root elements
        if (m_stack.isEmpty()) {
            m_rootElementCount++;
        }

        // tags written as <name/> are complete on their own and never get an end tag
        if (!tag.isEmptyXmlTag()) {
            m_stack.push(tag.getTagName());
        }
    }

    /**
     * Internally degrades Composite tags that do have children in the DOM tree
     * to simple single tags. This allows to avoid auto correction of unclosed HTML tags.<p>
     *
     * @return A node factory that will not auto correct open tags
     */
    protected PrototypicalNodeFactory configureNoAutoCorrectionTags() {

        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();

        CmsNoAutoCloseTag noAutoCloseTag = new CmsNoAutoCloseTag(NO_AUTOCLOSE_TAGS);
        // drop the built in registrations for these tag names before registering the replacement
        factory.unregisterTag(noAutoCloseTag);
        factory.registerTag(noAutoCloseTag);
        return factory;
    }

    /**
     * Records an opened tag that has not been closed, unless the tag does not need an end tag.<p>
     *
     * @param tagName the name of the tag that was removed from the stack without a matching end tag
     */
    private void reportUnclosedTag(String tagName) {

        if (requiresEndTag(tagName)) {
            m_messages.add(Messages.get().container(Messages.ERR_UNBALANCED_OPENING_TAG_1, tagName));
            m_unbalancedOpenedTags++;
        }
    }
}