001
002package org.opencms.util;
003
004import java.util.HashMap;
005import java.util.Iterator;
006import java.util.List;
007import java.util.Map;
008
009import org.htmlparser.Tag;
010import org.htmlparser.Text;
011import org.htmlparser.util.Translate;
012
013/**
014 * Extracts the HTML page content.<p>
015 */
016public class CmsHtml2TextConverter extends CmsHtmlParser {
017
018    /** Indicated to append or store the next line breaks. */
019    private boolean m_appendBr;
020
021    /** Map of stored attributes that must be written to the output when the tag closes. */
022    private Map<Tag, String> m_attributeMap;
023
024    /** The last appended line break count. */
025    private int m_brCount;
026
027    /** The current indentation. */
028    private int m_indent;
029
030    /** The current line length. */
031    private int m_lineLength;
032
033    /** The marker String (for headlines, bullets etc.). */
034    private String m_marker;
035
036    /** The maximum line length. */
037    private int m_maxLineLength;
038
039    /** The last stored, but not appended line break count. */
040    private int m_storedBrCount;
041
042    /** List of tags where to ignore the text. */
043    private final List<String> IGNORE_TEXT = List.of("SCRIPT", "STYLE", "TEMPLATE");
044
045    /** Flag indicating whether we are in a ignore text tag. */
046    private boolean m_ignoreText;
047
048    /**
049     * Creates a new instance of the html converter.<p>
050     */
051    public CmsHtml2TextConverter() {
052
053        m_result = new StringBuffer(512);
054        m_maxLineLength = 100;
055        m_attributeMap = new HashMap<Tag, String>(16);
056    }
057
058    /**
059     * Extracts the text from the given html content, assuming the given html encoding.<p>
060     *
061     * @param html the content to extract the plain text from
062     * @param encoding the encoding to use
063     *
064     * @return the text extracted from the given html content
065     *
066     * @throws Exception if something goes wrong
067     */
068    public static String html2text(String html, String encoding) throws Exception {
069
070        // create the converter instance
071        CmsHtml2TextConverter visitor = new CmsHtml2TextConverter();
072        return visitor.process(html, encoding);
073    }
074
075    /**
076     * @see org.htmlparser.visitors.NodeVisitor#visitEndTag(org.htmlparser.Tag)
077     */
078    @Override
079    public void visitEndTag(Tag tag) {
080
081        m_appendBr = false;
082        m_ignoreText = false;
083        appendLinebreaks(tag, false);
084        String attribute = m_attributeMap.remove(tag.getParent());
085        if (attribute != null) {
086            appendText(attribute);
087        }
088    }
089
090    /**
091     * @see org.htmlparser.visitors.NodeVisitor#visitStringNode(org.htmlparser.Text)
092     */
093    @Override
094    public void visitStringNode(Text text) {
095
096        if (!m_ignoreText) {
097            appendText(text.toPlainTextString());
098        }
099        m_ignoreText = false;
100    }
101
102    /**
103     * @see org.htmlparser.visitors.NodeVisitor#visitTag(org.htmlparser.Tag)
104     */
105    @Override
106    public void visitTag(Tag tag) {
107
108        m_appendBr = true;
109        m_ignoreText = false;
110        appendLinebreaks(tag, true);
111        if (IGNORE_TEXT.contains(tag.getTagName())) {
112            m_ignoreText = true;
113        }
114        if (tag.getTagName().equals("IMG")) {
115            appendText("##IMG##");
116        }
117        String href = tag.getAttribute("href");
118        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(href)) {
119            appendAttribute(tag, " [" + href.trim() + "]");
120        }
121        String src = tag.getAttribute("src");
122        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(src)) {
123            appendAttribute(tag, " [" + src.trim() + "]");
124        }
125        String title = tag.getAttribute("title");
126        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(title)) {
127            appendAttribute(tag, " {" + title.trim() + "}");
128        }
129        String alt = tag.getAttribute("alt");
130        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(alt)) {
131            appendAttribute(tag, " {" + alt.trim() + "}");
132        }
133    }
134
135    /**
136     * Appends an attribute.<p>
137     *
138     * @param tag the tag
139     * @param text the attribute text
140     */
141    private void appendAttribute(Tag tag, String text) {
142
143        if (tag.getTagName().equals("IMG")) {
144            appendText(text);
145        } else {
146            String current = m_attributeMap.get(tag);
147            if (current != null) {
148                text = current + text;
149            }
150            m_attributeMap.put(tag, text);
151        }
152    }
153
154    /**
155     * Appends an indentation.<p>
156     */
157    private void appendIndentation() {
158
159        if (m_lineLength <= m_indent) {
160            int len = (m_marker != null) ? m_indent - (m_marker.length() + 1) : m_indent;
161            for (int i = 0; i < len; i++) {
162                m_result.append(' ');
163            }
164            if (m_marker != null) {
165                m_result.append(m_marker);
166                m_result.append(' ');
167                m_marker = null;
168            }
169        }
170    }
171
172    /**
173     * Appends a line break.<p>
174     *
175     * @param count the number of lines
176     */
177    private void appendLinebreak(int count) {
178
179        appendLinebreak(count, false);
180    }
181
182    /**
183     * Appends line breaks.<p>
184     *
185     * @param count the number of line breaks
186     * @param force if the line break should be forced
187     */
188    private void appendLinebreak(int count, boolean force) {
189
190        if (m_appendBr) {
191            if (m_storedBrCount > count) {
192                count = m_storedBrCount;
193            }
194            m_storedBrCount = 0;
195            if (force) {
196                m_brCount = 0;
197            }
198            while (m_brCount < count) {
199                m_result.append("\r\n");
200                m_brCount++;
201            }
202            m_lineLength = m_indent;
203        } else {
204            while (m_storedBrCount < count) {
205                m_storedBrCount++;
206            }
207        }
208    }
209
210    /**
211     * Appends line breaks.<p>
212     *
213     * @param tag the tag
214     * @param open the open flag
215     */
216    private void appendLinebreaks(Tag tag, boolean open) {
217
218        String name = tag.getTagName();
219        int pos = TAG_LIST.indexOf(name);
220
221        switch (pos) {
222            case 0: // H1
223                setMarker("=", open);
224                setIndentation(2, open);
225                appendLinebreak(2);
226                break;
227            case 1: // H2
228                setMarker("==", open);
229                setIndentation(3, open);
230                appendLinebreak(2);
231                break;
232            case 2: // H3
233                setMarker("===", open);
234                setIndentation(4, open);
235                appendLinebreak(2);
236                break;
237            case 3: // H4
238                setMarker("====", open);
239                setIndentation(5, open);
240                appendLinebreak(2);
241                break;
242            case 4: // H5
243                setMarker("=====", open);
244                setIndentation(6, open);
245                appendLinebreak(2);
246                break;
247            case 5: // H6
248                setMarker("=======", open);
249                setIndentation(7, open);
250                appendLinebreak(2);
251                break;
252            case 6: // P
253            case 7: // DIV
254                appendLinebreak(2);
255                break;
256            case 8: // SPAN
257                break;
258            case 9: // BR
259                appendLinebreak(1, true);
260                break;
261            case 10: // OL
262            case 11: // UL
263                appendLinebreak(2);
264                break;
265            case 12: // LI
266                setMarker("*", open);
267                setIndentation(5, open);
268                appendLinebreak(1);
269                break;
270            case 13: // TABLE
271                setIndentation(5, open);
272                appendLinebreak(2);
273                if (open) {
274                    appendLinebreak(1);
275                    appendText("-----");
276                    appendLinebreak(1);
277                }
278                break;
279            case 14: // TD
280                setMarker("--", open);
281                appendLinebreak(2);
282                break;
283            case 15: // TR
284                if (!open) {
285                    appendLinebreak(1);
286                    appendText("-----");
287                    appendLinebreak(1);
288                }
289                break;
290            case 16: // TH
291            case 17: // THEAD
292            case 18: // TBODY
293            case 19: // TFOOT
294                appendLinebreak(1);
295                break;
296            default: // unknown tag (ignore)
297        }
298    }
299
300    /**
301     * Appends text.<p>
302     *
303     * @param text the text
304     */
305    private void appendText(String text) {
306
307        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) {
308            text = Translate.decode(text);
309            text = collapse(text);
310        }
311        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) {
312
313            if (m_storedBrCount > 0) {
314                m_appendBr = true;
315                appendLinebreak(m_storedBrCount);
316            }
317            appendIndentation();
318            m_brCount = 0;
319
320            List<String> wordList = CmsStringUtil.splitAsList(text, ' ');
321            Iterator<String> i = wordList.iterator();
322            while (i.hasNext()) {
323                String word = i.next();
324                boolean hasNbsp = ((word.charAt(0) == 160) || (word.charAt(word.length() - 1) == 160));
325                if ((word.length() + 1 + m_lineLength) > m_maxLineLength) {
326                    m_appendBr = true;
327                    appendLinebreak(1);
328                    appendIndentation();
329                    m_brCount = 0;
330                } else {
331                    if (!hasNbsp
332                        && (m_lineLength > m_indent)
333                        && (m_result.charAt(m_result.length() - 1) != 160)
334                        && (m_result.charAt(m_result.length() - 1) != 32)) {
335
336                        m_result.append(' ');
337                        m_lineLength++;
338                    }
339                }
340                m_result.append(word);
341                m_lineLength += word.length();
342            }
343        }
344    }
345
346    /**
347     * Sets the indentation.<p>
348     *
349     * @param length the indentation length
350     * @param open if the indentation should be added or reduced
351     */
352    private void setIndentation(int length, boolean open) {
353
354        if (open) {
355            m_indent += length;
356        } else {
357            m_indent -= length;
358            if (m_indent < 0) {
359                m_indent = 0;
360            }
361        }
362    }
363
364    /**
365     * Sets the marker.<p>
366     *
367     * @param marker the marker
368     * @param open if the marker should be added
369     */
370    private void setMarker(String marker, boolean open) {
371
372        if (open) {
373            m_marker = marker;
374        }
375    }
376}