001/* 002 * This library is part of OpenCms - 003 * the Open Source Content Management System 004 * 005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com) 006 * 007 * This library is free software; you can redistribute it and/or 008 * modify it under the terms of the GNU Lesser General Public 009 * License as published by the Free Software Foundation; either 010 * version 2.1 of the License, or (at your option) any later version. 011 * 012 * This library is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * Lesser General Public License for more details. 016 * 017 * For further information about Alkacon Software GmbH & Co. KG, please see the 018 * company website: http://www.alkacon.com 019 * 020 * For further information about OpenCms, please see the 021 * project website: http://www.opencms.org 022 * 023 * You should have received a copy of the GNU Lesser General Public 024 * License along with this library; if not, write to the Free Software 025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 026 */ 027 028package org.opencms.util; 029 030import java.util.ArrayList; 031import java.util.Arrays; 032import java.util.Iterator; 033import java.util.List; 034 035import org.htmlparser.Parser; 036import org.htmlparser.PrototypicalNodeFactory; 037import org.htmlparser.Remark; 038import org.htmlparser.Tag; 039import org.htmlparser.Text; 040import org.htmlparser.lexer.Lexer; 041import org.htmlparser.lexer.Page; 042import org.htmlparser.util.ParserException; 043import org.htmlparser.visitors.NodeVisitor; 044 045/** 046 * Base utility class for OpenCms <code>{@link org.htmlparser.visitors.NodeVisitor}</code> 047 * implementations, which provides some often used utility functions. 048 * <p> 049 * 050 * This base implementation is only a "pass through" class, that is the content is parsed, but the 051 * generated result is exactly identical to the input. 052 * <p> 053 * 054 * @since 6.2.0 055 */ 056public class CmsHtmlParser extends NodeVisitor implements I_CmsHtmlNodeVisitor { 057 058 /** List of upper case tag name strings of tags that should not be auto-corrected if closing divs are missing. */ 059 protected List<String> m_noAutoCloseTags; 060 061 /** The array of supported tag names. */ 062 // important: don't change the order of these tags in the source, subclasses may expect the tags 063 // at the exact indices give here 064 // if you want to add tags, add them at the end 065 protected static final String[] TAG_ARRAY = new String[] { 066 "H1", 067 "H2", 068 "H3", 069 "H4", 070 "H5", 071 "H6", 072 "P", 073 "DIV", 074 "SPAN", 075 "BR", 076 "OL", 077 "UL", 078 "LI", 079 "TABLE", 080 "TD", 081 "TR", 082 "TH", 083 "THEAD", 084 "TBODY", 085 "TFOOT"}; 086 087 /** The list of supported tag names. */ 088 protected static final List<String> TAG_LIST = Arrays.asList(TAG_ARRAY); 089 090 /** Indicates if "echo" mode is on, that is all content is written to the result by default. */ 091 protected boolean m_echo; 092 093 /** The buffer to write the out to. */ 094 protected StringBuffer m_result; 095 096 /** The providable configuration - never null by contract of interface. */ 097 private String m_configuration = ""; 098 099 /** 100 * Creates a new instance of the html converter with echo mode set to <code>false</code>. 101 * <p> 102 */ 103 public CmsHtmlParser() { 104 105 this(false); 106 } 107 108 /** 109 * Creates a new instance of the html converter. 110 * <p> 111 * 112 * @param echo indicates if "echo" mode is on, that is all content is written to the result 113 */ 114 public CmsHtmlParser(boolean echo) { 115 116 m_result = new StringBuffer(1024); 117 m_echo = echo; 118 m_noAutoCloseTags = new ArrayList<String>(32); 119 } 120 121 /** 122 * Internally degrades Composite tags that do have children in the DOM tree 123 * to simple single tags. This allows to avoid auto correction of unclosed HTML tags.<p> 124 * 125 * @return A node factory that will not autocorrect open tags specified via <code>{@link #setNoAutoCloseTags(List)}</code> 126 */ 127 protected PrototypicalNodeFactory configureNoAutoCorrectionTags() { 128 129 PrototypicalNodeFactory factory = new PrototypicalNodeFactory(); 130 131 String tagName; 132 Iterator<String> it = m_noAutoCloseTags.iterator(); 133 CmsNoAutoCloseTag noAutoCloseTag; 134 while (it.hasNext()) { 135 tagName = it.next(); 136 noAutoCloseTag = new CmsNoAutoCloseTag(new String[] {tagName}); 137 // TODO: This might break in case registering / unregistering will change from name based to tag-type based approach: 138 factory.unregisterTag(noAutoCloseTag); 139 factory.registerTag(noAutoCloseTag); 140 } 141 return factory; 142 } 143 144 /** 145 * @see org.opencms.util.I_CmsHtmlNodeVisitor#getConfiguration() 146 */ 147 public String getConfiguration() { 148 149 return m_configuration; 150 } 151 152 /** 153 * @see org.opencms.util.I_CmsHtmlNodeVisitor#getResult() 154 */ 155 public String getResult() { 156 157 return m_result.toString(); 158 } 159 160 /** 161 * Returns the HTML for the given tag itself (not the tag content). 162 * <p> 163 * 164 * @param tag the tag to create the HTML for 165 * 166 * @return the HTML for the given tag 167 */ 168 public String getTagHtml(Tag tag) { 169 170 StringBuffer result = new StringBuffer(32); 171 result.append('<'); 172 result.append(tag.getText()); 173 result.append('>'); 174 return result.toString(); 175 } 176 177 /** 178 * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String) 179 */ 180 public String process(String html, String encoding) throws ParserException { 181 182 m_result = new StringBuffer(); 183 Parser parser = new Parser(); 184 Lexer lexer = new Lexer(); 185 186 // initialize the page with the given char set 187 Page page = new Page(html, encoding); 188 lexer.setPage(page); 189 parser.setLexer(lexer); 190 191 if ((m_noAutoCloseTags != null) && (m_noAutoCloseTags.size() > 0)) { 192 // Degrade Composite tags that do have children in the DOM tree 193 // to simple single tags: This allows to finish this tag with opened HTML tags without the effect 194 // that html parser will generate the closing tags. 195 PrototypicalNodeFactory factory = configureNoAutoCorrectionTags(); 196 lexer.setNodeFactory(factory); 197 } 198 199 // process the page using the given visitor 200 parser.visitAllNodesWith(this); 201 // return the result 202 return getResult(); 203 } 204 205 /** 206 * 207 * @see org.opencms.util.I_CmsHtmlNodeVisitor#setConfiguration(java.lang.String) 208 */ 209 public void setConfiguration(String configuration) { 210 211 if (CmsStringUtil.isNotEmpty(configuration)) { 212 m_configuration = configuration; 213 } 214 215 } 216 217 /** 218 * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitEndTag(org.htmlparser.Tag) 219 */ 220 @Override 221 public void visitEndTag(Tag tag) { 222 223 if (m_echo) { 224 m_result.append(getTagHtml(tag)); 225 } 226 } 227 228 /** 229 * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitRemarkNode(org.htmlparser.Remark) 230 */ 231 @Override 232 public void visitRemarkNode(Remark remark) { 233 234 if (m_echo) { 235 m_result.append(remark.toHtml(true)); 236 } 237 } 238 239 /** 240 * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitStringNode(org.htmlparser.Text) 241 */ 242 @Override 243 public void visitStringNode(Text text) { 244 245 if (m_echo) { 246 m_result.append(text.getText()); 247 } 248 } 249 250 /** 251 * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitTag(org.htmlparser.Tag) 252 */ 253 @Override 254 public void visitTag(Tag tag) { 255 256 if (m_echo) { 257 m_result.append(getTagHtml(tag)); 258 } 259 } 260 261 /** 262 * Collapse HTML whitespace in the given String.<p> 263 * 264 * @param string the string to collapse 265 * 266 * @return the input String with all HTML whitespace collapsed 267 */ 268 protected String collapse(String string) { 269 270 int len = string.length(); 271 StringBuffer result = new StringBuffer(len); 272 int state = 0; 273 for (int i = 0; i < len; i++) { 274 char c = string.charAt(i); 275 switch (c) { 276 // see HTML specification section 9.1 White space 277 // http://www.w3.org/TR/html4/struct/text.html#h-9.1 278 case '\u0020': 279 case '\u0009': 280 case '\u000C': 281 case '\u200B': 282 case '\r': 283 case '\n': 284 if (0 != state) { 285 state = 1; 286 } 287 break; 288 default: 289 if (1 == state) { 290 result.append(' '); 291 } 292 state = 2; 293 result.append(c); 294 } 295 } 296 return result.toString(); 297 } 298 299 /** 300 * Returns a list of upper case tag names for which parsing / visiting will not correct missing closing tags.<p> 301 * 302 * @return a List of upper case tag names for which parsing / visiting will not correct missing closing tags 303 */ 304 public List<String> getNoAutoCloseTags() { 305 306 return m_noAutoCloseTags; 307 } 308 309 /** 310 * Sets a list of upper case tag names for which parsing / visiting should not correct missing closing tags.<p> 311 * 312 * @param noAutoCloseTagList a list of upper case tag names for which parsing / visiting 313 * should not correct missing closing tags to set. 314 */ 315 public void setNoAutoCloseTags(List<String> noAutoCloseTagList) { 316 317 // ensuring upper case 318 m_noAutoCloseTags.clear(); 319 if (noAutoCloseTagList != null) { 320 Iterator<String> it = noAutoCloseTagList.iterator(); 321 while (it.hasNext()) { 322 m_noAutoCloseTags.add((it.next()).toUpperCase()); 323 } 324 } 325 } 326}