/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.util;

import java.util.List;

import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.util.ParserException;

/**
 * Interface for a combination of a visitor of HTML documents along with the hook to start the
 * parser / lexer that triggers the visit.<p>
 *
 * The <code>visit*</code> methods are the callbacks invoked for the individual nodes of the
 * document, <code>{@link #process(String, String)}</code> is the hook that starts the visit.<p>
 *
 * @since 6.1.3
 */
public interface I_CmsHtmlNodeVisitor {

    /**
     * Returns the configuration String of this visitor, or the empty String if none was provided
     * before.<p>
     *
     * @return the configuration String of this visitor - by this contract never <code>null</code>,
     *         but an empty String if not provided
     *
     * @see #setConfiguration(String)
     */
    String getConfiguration();

    /**
     * Returns the text extraction result.<p>
     *
     * @return the text extraction result
     */
    String getResult();

    /**
     * Extracts the text from the given html content, assuming the given html encoding.<p>
     *
     * @param html the content to extract the plain text from
     * @param encoding the encoding to use
     *
     * @return the text extracted from the given html content
     *
     * @throws ParserException if something goes wrong
     */
    String process(String html, String encoding) throws ParserException;

    /**
     * Sets a configuration String for this visitor.<p>
     *
     * This will most likely be done with data from an xsd, custom jsp tag, ...<p>
     *
     * @param configuration the configuration of this visitor to set
     *
     * @see #getConfiguration()
     */
    void setConfiguration(String configuration);

    /**
     * Sets a list of upper case tag names for which parsing / visiting should not correct missing closing tags.<p>
     *
     * This has to be used before <code>{@link #process(String, String)}</code> is invoked to take effect.<p>
     *
     * @param noAutoCloseTags a list of upper case tag names for which parsing / visiting
     *      should not correct missing closing tags
     */
    void setNoAutoCloseTags(List<String> noAutoCloseTags);

    /**
     * Visitor method (callback) invoked when a closing Tag is encountered.<p>
     *
     * @param tag the tag that is ended
     *
     * @see org.htmlparser.visitors.NodeVisitor#visitEndTag(org.htmlparser.Tag)
     */
    void visitEndTag(Tag tag);

    /**
     * Visitor method (callback) invoked when a remark Tag (HTML comment) is encountered.<p>
     *
     * @param remark the remark Tag to visit
     *
     * @see org.htmlparser.visitors.NodeVisitor#visitRemarkNode(org.htmlparser.Remark)
     */
    void visitRemarkNode(Remark remark);

    /**
     * Visitor method (callback) invoked when a text (string) node is encountered.<p>
     *
     * @param text the text that is visited
     *
     * @see org.htmlparser.visitors.NodeVisitor#visitStringNode(org.htmlparser.Text)
     */
    void visitStringNode(Text text);

    /**
     * Visitor method (callback) invoked when a starting Tag is encountered.<p>
     *
     * @param tag the tag that is visited
     *
     * @see org.htmlparser.visitors.NodeVisitor#visitTag(org.htmlparser.Tag)
     */
    void visitTag(Tag tag);

}