001
002package org.opencms.util;
003
004import java.util.HashMap;
005import java.util.Iterator;
006import java.util.List;
007import java.util.Map;
008
009import org.htmlparser.Tag;
010import org.htmlparser.Text;
011import org.htmlparser.util.Translate;
012
013/**
014 * Extracts the HTML page content.<p>
015 */
016public class CmsHtml2TextConverter extends CmsHtmlParser {
017
018    /** Indicated to append or store the next line breaks. */
019    private boolean m_appendBr;
020
021    /** Map of stored attributes that must be written to the output when the tag closes. */
022    private Map<Tag, String> m_attributeMap;
023
024    /** The last appended line break count. */
025    private int m_brCount;
026
027    /** The current indentation. */
028    private int m_indent;
029
030    /** The current line length. */
031    private int m_lineLength;
032
033    /** The marker String (for headlines, bullets etc.). */
034    private String m_marker;
035
036    /** The maximum line length. */
037    private int m_maxLineLength;
038
039    /** The last stored, but not appended line break count. */
040    private int m_storedBrCount;
041
042    /**
043     * Creates a new instance of the html converter.<p>
044     */
045    public CmsHtml2TextConverter() {
046
047        m_result = new StringBuffer(512);
048        m_maxLineLength = 100;
049        m_attributeMap = new HashMap<Tag, String>(16);
050    }
051
052    /**
053     * Extracts the text from the given html content, assuming the given html encoding.<p>
054     *
055     * @param html the content to extract the plain text from
056     * @param encoding the encoding to use
057     *
058     * @return the text extracted from the given html content
059     *
060     * @throws Exception if something goes wrong
061     */
062    public static String html2text(String html, String encoding) throws Exception {
063
064        // create the converter instance
065        CmsHtml2TextConverter visitor = new CmsHtml2TextConverter();
066        return visitor.process(html, encoding);
067    }
068
069    /**
070     * @see org.htmlparser.visitors.NodeVisitor#visitEndTag(org.htmlparser.Tag)
071     */
072    @Override
073    public void visitEndTag(Tag tag) {
074
075        m_appendBr = false;
076        appendLinebreaks(tag, false);
077        String attribute = m_attributeMap.remove(tag.getParent());
078        if (attribute != null) {
079            appendText(attribute);
080        }
081    }
082
083    /**
084     * @see org.htmlparser.visitors.NodeVisitor#visitStringNode(org.htmlparser.Text)
085     */
086    @Override
087    public void visitStringNode(Text text) {
088
089        appendText(text.toPlainTextString());
090    }
091
092    /**
093     * @see org.htmlparser.visitors.NodeVisitor#visitTag(org.htmlparser.Tag)
094     */
095    @Override
096    public void visitTag(Tag tag) {
097
098        m_appendBr = true;
099        appendLinebreaks(tag, true);
100
101        if (tag.getTagName().equals("IMG")) {
102            appendText("##IMG##");
103        }
104
105        String href = tag.getAttribute("href");
106        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(href)) {
107            appendAttribute(tag, " [" + href.trim() + "]");
108        }
109        String src = tag.getAttribute("src");
110        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(src)) {
111            appendAttribute(tag, " [" + src.trim() + "]");
112        }
113        String title = tag.getAttribute("title");
114        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(title)) {
115            appendAttribute(tag, " {" + title.trim() + "}");
116        }
117        String alt = tag.getAttribute("alt");
118        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(alt)) {
119            appendAttribute(tag, " {" + alt.trim() + "}");
120        }
121    }
122
123    /**
124     * Appends an attribute.<p>
125     *
126     * @param tag the tag
127     * @param text the attribute text
128     */
129    private void appendAttribute(Tag tag, String text) {
130
131        if (tag.getTagName().equals("IMG")) {
132            appendText(text);
133        } else {
134            String current = m_attributeMap.get(tag);
135            if (current != null) {
136                text = current + text;
137            }
138            m_attributeMap.put(tag, text);
139        }
140    }
141
142    /**
143     * Appends an indentation.<p>
144     */
145    private void appendIndentation() {
146
147        if (m_lineLength <= m_indent) {
148            int len = (m_marker != null) ? m_indent - (m_marker.length() + 1) : m_indent;
149            for (int i = 0; i < len; i++) {
150                m_result.append(' ');
151            }
152            if (m_marker != null) {
153                m_result.append(m_marker);
154                m_result.append(' ');
155                m_marker = null;
156            }
157        }
158    }
159
160    /**
161     * Appends a line break.<p>
162     *
163     * @param count the number of lines
164     */
165    private void appendLinebreak(int count) {
166
167        appendLinebreak(count, false);
168    }
169
170    /**
171     * Appends line breaks.<p>
172     *
173     * @param count the number of line breaks
174     * @param force if the line break should be forced
175     */
176    private void appendLinebreak(int count, boolean force) {
177
178        if (m_appendBr) {
179            if (m_storedBrCount > count) {
180                count = m_storedBrCount;
181            }
182            m_storedBrCount = 0;
183            if (force) {
184                m_brCount = 0;
185            }
186            while (m_brCount < count) {
187                m_result.append("\r\n");
188                m_brCount++;
189            }
190            m_lineLength = m_indent;
191        } else {
192            while (m_storedBrCount < count) {
193                m_storedBrCount++;
194            }
195        }
196    }
197
198    /**
199     * Appends line breaks.<p>
200     *
201     * @param tag the tag
202     * @param open the open flag
203     */
204    private void appendLinebreaks(Tag tag, boolean open) {
205
206        String name = tag.getTagName();
207        int pos = TAG_LIST.indexOf(name);
208
209        switch (pos) {
210            case 0: // H1
211                setMarker("=", open);
212                setIndentation(2, open);
213                appendLinebreak(2);
214                break;
215            case 1: // H2
216                setMarker("==", open);
217                setIndentation(3, open);
218                appendLinebreak(2);
219                break;
220            case 2: // H3
221                setMarker("===", open);
222                setIndentation(4, open);
223                appendLinebreak(2);
224                break;
225            case 3: // H4
226                setMarker("====", open);
227                setIndentation(5, open);
228                appendLinebreak(2);
229                break;
230            case 4: // H5
231                setMarker("=====", open);
232                setIndentation(6, open);
233                appendLinebreak(2);
234                break;
235            case 5: // H6
236                setMarker("=======", open);
237                setIndentation(7, open);
238                appendLinebreak(2);
239                break;
240            case 6: // P
241            case 7: // DIV
242                appendLinebreak(2);
243                break;
244            case 8: // SPAN
245                break;
246            case 9: // BR
247                appendLinebreak(1, true);
248                break;
249            case 10: // OL
250            case 11: // UL
251                appendLinebreak(2);
252                break;
253            case 12: // LI
254                setMarker("*", open);
255                setIndentation(5, open);
256                appendLinebreak(1);
257                break;
258            case 13: // TABLE
259                setIndentation(5, open);
260                appendLinebreak(2);
261                if (open) {
262                    appendLinebreak(1);
263                    appendText("-----");
264                    appendLinebreak(1);
265                }
266                break;
267            case 14: // TD
268                setMarker("--", open);
269                appendLinebreak(2);
270                break;
271            case 15: // TR
272                if (!open) {
273                    appendLinebreak(1);
274                    appendText("-----");
275                    appendLinebreak(1);
276                }
277                break;
278            case 16: // TH
279            case 17: // THEAD
280            case 18: // TBODY
281            case 19: // TFOOT
282                appendLinebreak(1);
283                break;
284            default: // unknown tag (ignore)
285        }
286    }
287
288    /**
289     * Appends text.<p>
290     *
291     * @param text the text
292     */
293    private void appendText(String text) {
294
295        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) {
296            text = Translate.decode(text);
297            text = collapse(text);
298        }
299        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) {
300
301            if (m_storedBrCount > 0) {
302                m_appendBr = true;
303                appendLinebreak(m_storedBrCount);
304            }
305            appendIndentation();
306            m_brCount = 0;
307
308            List<String> wordList = CmsStringUtil.splitAsList(text, ' ');
309            Iterator<String> i = wordList.iterator();
310            while (i.hasNext()) {
311                String word = i.next();
312                boolean hasNbsp = ((word.charAt(0) == 160) || (word.charAt(word.length() - 1) == 160));
313                if ((word.length() + 1 + m_lineLength) > m_maxLineLength) {
314                    m_appendBr = true;
315                    appendLinebreak(1);
316                    appendIndentation();
317                    m_brCount = 0;
318                } else {
319                    if (!hasNbsp
320                        && (m_lineLength > m_indent)
321                        && (m_result.charAt(m_result.length() - 1) != 160)
322                        && (m_result.charAt(m_result.length() - 1) != 32)) {
323
324                        m_result.append(' ');
325                        m_lineLength++;
326                    }
327                }
328                m_result.append(word);
329                m_lineLength += word.length();
330            }
331        }
332    }
333
334    /**
335     * Sets the indentation.<p>
336     *
337     * @param length the indentation length
338     * @param open if the indentation should be added or reduced
339     */
340    private void setIndentation(int length, boolean open) {
341
342        if (open) {
343            m_indent += length;
344        } else {
345            m_indent -= length;
346            if (m_indent < 0) {
347                m_indent = 0;
348            }
349        }
350    }
351
352    /**
353     * Sets the marker.<p>
354     *
355     * @param marker the marker
356     * @param open if the marker should be added
357     */
358    private void setMarker(String marker, boolean open) {
359
360        if (open) {
361            m_marker = marker;
362        }
363    }
364}