/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.util;

import org.opencms.i18n.CmsEncoder;
import org.opencms.main.CmsLog;

import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Vector;

import org.apache.commons.logging.Log;

import org.htmlparser.Attribute;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;

/**
 * Simple HTML tag stripper that allows configuration of the HTML tag names that are allowed.<p>
 *
 * All tags that are not explicitly allowed via invocation of one of the
 * <code>addPreserve...</code> methods will be missing in the result of the method
 * <code>{@link #stripHtml(String)}</code>.<p>
 *
 * Instances are reusable but not shareable (multithreading). If the configuration should be changed
 * between subsequent invocations of <code>{@link #stripHtml(String)}</code>, the method
 * <code>{@link #reset()}</code> has to be called.<p>
 *
 * @since 6.9.2
 */
public final class CmsHtmlStripper {

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsHtmlStripper.class);

    /** A tag factory that is able to make tags invisible to visitors. */
    private CmsHtmlTagRemoveFactory m_nodeFactory;

    /** Flag to control whether tidy is used. */
    private boolean m_useTidy;

    /**
     * Default constructor, creates a stripper with an empty set of tags to preserve
     * that does not use tidy.<p>
     */
    public CmsHtmlStripper() {

        reset();
    }

    /**
     * Creates an instance with control whether tidy is used.<p>
     *
     * @param useTidy if true tidy will be used
     */
    public CmsHtmlStripper(final boolean useTidy) {

        this();
        m_useTidy = useTidy;
    }

    /**
     * Adds a tag that will be preserved by <code>{@link #stripHtml(String)}</code>.<p>
     *
     * @param tagName the name of the tag to keep (case insensitive)
     *
     * @return true if the tagName was added correctly to the internal engine
     */
    public boolean addPreserveTag(final String tagName) {

        // the node factory API requires a Vector for the attribute list
        Vector<Attribute> attributeList = new Vector<Attribute>(1);
        Attribute tagNameAttribute = new Attribute();
        // use a fixed locale: the default-locale variant breaks tag names under e.g. a
        // Turkish default locale, where "TITLE" would be lower cased to "t\u0131tle"
        tagNameAttribute.setName(tagName.toLowerCase(Locale.ENGLISH));
        attributeList.add(tagNameAttribute);
        Tag keepTag = m_nodeFactory.createTagNode(null, 0, 0, attributeList);
        return m_nodeFactory.addTagPreserve(keepTag);
    }

    /**
     * Convenience method for adding several tags to preserve.<p>
     *
     * @param preserveTags a {@code List<String>} with the case-insensitive tag names of the tags to preserve
     *
     * @see #addPreserveTag(String)
     */
    public void addPreserveTagList(List<String> preserveTags) {

        for (Iterator<String> it = preserveTags.iterator(); it.hasNext();) {
            addPreserveTag(it.next());
        }
    }

    /**
     * Convenience method for adding several tags to preserve
     * in form of a delimiter-separated String.<p>
     *
     * The String will be split with <code>{@link CmsStringUtil#splitAsList(String, char, boolean)}</code>
     * with <code>tagList</code> as the first argument, <code>separator</code> as the
     * second argument and the third argument set to true (trimming - support).<p>
     *
     * @param tagList a delimiter-separated String with case-insensitive tag names to preserve by
     *      <code>{@link #stripHtml(String)}</code>
     * @param separator the delimiter that separates tag names in the <code>tagList</code> argument
     *
     * @see #addPreserveTag(String)
     */
    public void addPreserveTags(final String tagList, final char separator) {

        List<String> tags = CmsStringUtil.splitAsList(tagList, separator, true);
        addPreserveTagList(tags);
    }

    /**
     * Resets the configuration of the tags to preserve.<p>
     *
     * This is called from the constructor and only has to be called if this
     * instance is reused with a different configuration (of tags to keep).
     * The tidy setting of this instance is not changed.<p>
     */
    public void reset() {

        m_nodeFactory = new CmsHtmlTagRemoveFactory();
    }

    /**
     * Processes the given HTML content with the tag configuration of this instance.<p>
     *
     * If this instance was created with tidy enabled, the content is tidied first.
     * The content is then parsed using the node factory that holds the tags to preserve,
     * and all nodes are visited by a <code>{@link CmsHtmlParser}</code> in echo mode,
     * whose result is returned.<p>
     *
     * @param html the HTML content to process
     *
     * @return the result collected by the visitor for the given HTML content
     *
     * @throws ParserException if parsing the content fails
     */
    public String stripHtml(final String html) throws ParserException {

        String content = html;
        if (m_useTidy) {
            content = tidy(content);
        }

        // initialize a parser that creates its nodes with the configured node factory
        Parser parser = new Parser();
        parser.setNodeFactory(m_nodeFactory);
        Lexer lexer = new Lexer();
        Page page = new Page(content);
        lexer.setPage(page);
        parser.setLexer(lexer);
        // process the page using a string collecting visitor with echo on
        CmsHtmlParser visitor = new CmsHtmlParser(true);
        parser.visitAllNodesWith(visitor);
        // return the result
        return visitor.getResult();
    }

    /**
     * Internally tidies with cleanup and XHTML.<p>
     *
     * If the conversion fails because of an unsupported encoding, a warning is logged
     * and the unmodified content is returned.<p>
     *
     * @param content HTML to clean
     *
     * @return the tidy HTML
     */
    private String tidy(final String content) {

        String mode = CmsHtmlConverter.PARAM_WORD + ";" + CmsHtmlConverter.PARAM_XHTML;
        CmsHtmlConverter converter = new CmsHtmlConverter(CmsEncoder.ENCODING_UTF_8, mode);
        String result = content;
        try {
            result = converter.convertToString(content);
        } catch (UnsupportedEncodingException e) {
            // should never happen
            if (LOG.isWarnEnabled()) {
                LOG.warn(Messages.get().getBundle().key(Messages.LOG_WARN_TIDY_FAILURE_0), e);
            }
        }
        return result;
    }
}