001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.util;
029
030import org.opencms.i18n.CmsEncoder;
031import org.opencms.main.CmsLog;
032
033import java.io.UnsupportedEncodingException;
034import java.util.Iterator;
035import java.util.List;
036import java.util.Vector;
037
038import org.apache.commons.logging.Log;
039
040import org.htmlparser.Attribute;
041import org.htmlparser.Parser;
042import org.htmlparser.Tag;
043import org.htmlparser.lexer.Lexer;
044import org.htmlparser.lexer.Page;
045import org.htmlparser.util.ParserException;
046
047/**
048 * Simple html tag stripper that allows configuration of html tag names that are allowed.
049 * <p>
050 *
051 * All tags that are not explicitly allowed via invocation of one of the
052 * <code>addPreserve...</code> methods will be missing in the result of the method
053 * <code>{@link #stripHtml(String)}</code>.<p>
054 *
055 * Instances are reusable but not shareable (multithreading). If configuration should be changed
056 * between subsequent invocations of <code>{@link #stripHtml(String)}</code> method
057 * <code>{@link #reset()}</code> has to be called.
058 * <p>
059 *
060 * @since 6.9.2
061 *
062 */
063public final class CmsHtmlStripper {
064
065    /** The log object for this class. */
066    private static final Log LOG = CmsLog.getLog(CmsHtmlStripper.class);
067
068    /** A tag factory that is able to make tags invisible to visitors. */
069    private CmsHtmlTagRemoveFactory m_nodeFactory;
070
071    /** Flag to control whether tidy is used. */
072    private boolean m_useTidy;
073
074    /**
075     * Default constructor that turns echo on and uses the settings for replacing tags.
076     * <p>
077     */
078    public CmsHtmlStripper() {
079
080        reset();
081    }
082
083    /**
084     * Creates an instance with control whether tidy is used.<p>
085     *
086     * @param useTidy if true tidy will be used
087     */
088    public CmsHtmlStripper(final boolean useTidy) {
089
090        this();
091        m_useTidy = useTidy;
092    }
093
094    /**
095     * Adds a tag that will be preserved by <code>{@link #stripHtml(String)}</code>.<p>
096     *
097     * @param tagName the name of the tag to keep (case insensitive)
098     *
099     * @return true if the tagName was added correctly to the internal engine
100     */
101    public boolean addPreserveTag(final String tagName) {
102
103        Vector<Attribute> attributeList = new Vector<Attribute>(1);
104        Attribute tagNameAttribute = new Attribute();
105        tagNameAttribute.setName(tagName.toLowerCase());
106        attributeList.add(tagNameAttribute);
107        Tag keepTag = m_nodeFactory.createTagNode(null, 0, 0, attributeList);
108        boolean result = m_nodeFactory.addTagPreserve(keepTag);
109        return result;
110    }
111
112    /**
113     * Convenience method for adding several tags to preserve.<p>
114     *
115     * @param preserveTags a <code>List&lt;String&gt;</code> with the case-insensitive tag names of the tags to preserve
116     *
117     * @see #addPreserveTag(String)
118     */
119    public void addPreserveTagList(List<String> preserveTags) {
120
121        for (Iterator<String> it = preserveTags.iterator(); it.hasNext();) {
122            addPreserveTag(it.next());
123        }
124    }
125
126    /**
127     * Convenience method for adding several tags to preserve
128     * in form of a delimiter-separated String.<p>
129     *
130     * The String will be <code>{@link CmsStringUtil#splitAsList(String, char, boolean)}</code>
131     * with <code>tagList</code> as the first argument, <code>separator</code> as the
132     * second argument and the third argument set to true (trimming - support).<p>
133     *
134     * @param tagList a delimiter-separated String with case-insensitive tag names to preserve by
135     *      <code>{@link #stripHtml(String)}</code>
136     * @param separator the delimiter that separates tag names in the <code>tagList</code> argument
137     *
138     * @see #addPreserveTag(String)
139     */
140    public void addPreserveTags(final String tagList, final char separator) {
141
142        List<String> tags = CmsStringUtil.splitAsList(tagList, separator, true);
143        addPreserveTagList(tags);
144    }
145
146    /**
147     * Resets the configuration of the tags to preserve.<p>
148     *
149     * This is called from the constructor and only has to be called if this
150     * instance is reused with a differen configuration (of tags to keep).<p>
151     *
152     */
153    public void reset() {
154
155        m_nodeFactory = new CmsHtmlTagRemoveFactory();
156    }
157
158    /**
159     * Extracts the text from the given html content, assuming the given html encoding.
160     * <p>
161     * Additionally tags are replaced / removed according to the configuration of this instance.
162     * <p>
163     *
164     * <h3>Please note:</h3>
165     * There are static process methods in the superclass that will not do the replacements /
166     * removals. Don't mix them up with this method.
167     * <p>
168     *
169     * @param html the content to extract the plain text from.
170     *
171     * @return the text extracted from the given html content.
172     *
173     * @throws ParserException if something goes wrong.
174     */
175    public String stripHtml(final String html) throws ParserException {
176
177        String content = html;
178        if (m_useTidy) {
179            content = tidy(content);
180        }
181
182        // initialize a parser with the given charset
183        Parser parser = new Parser();
184        parser.setNodeFactory(m_nodeFactory);
185        Lexer lexer = new Lexer();
186        Page page = new Page(content);
187        lexer.setPage(page);
188        parser.setLexer(lexer);
189        // process the page using a string collection wizard
190        // echo on
191        CmsHtmlParser visitor = new CmsHtmlParser(true);
192        parser.visitAllNodesWith(visitor);
193        // return the result
194        return visitor.getResult();
195    }
196
197    /**
198     * Internally tidies with cleanup and XHTML.<p>
199     *
200     * @param content HTML to clean
201     *
202     * @return the tidy HTML
203     */
204    private String tidy(final String content) {
205
206        CmsHtmlConverter converter = new CmsHtmlConverter(
207            CmsEncoder.ENCODING_UTF_8,
208            new StringBuffer(CmsHtmlConverter.PARAM_WORD).append(";").append(CmsHtmlConverter.PARAM_XHTML).toString());
209        String result = content;
210        try {
211            result = converter.convertToString(content);
212        } catch (UnsupportedEncodingException e) {
213            // should never happen
214            if (LOG.isWarnEnabled()) {
215                LOG.warn(Messages.get().getBundle().key(Messages.LOG_WARN_TIDY_FAILURE_0), e);
216            }
217        }
218        return result;
219    }
220}