package org.opencms.util;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.util.Translate;

/**
 * Extracts the HTML page content as formatted plain text.<p>
 *
 * The converter is a node visitor: block level tags are translated into line breaks,
 * indentation and markers (for headlines, list items and table cells), while the values of the
 * <code>href</code>, <code>src</code>, <code>title</code> and <code>alt</code> attributes are
 * appended to the text in brackets / braces.<p>
 *
 * Lines are wrapped once the maximum line length (100 characters) would be exceeded.<p>
 */
public class CmsHtml2TextConverter extends CmsHtmlParser {

    /** Names of the tags whose text content is ignored. */
    private static final List<String> IGNORE_TEXT = List.of("SCRIPT", "STYLE", "TEMPLATE");

    /** Tag name of an image tag. */
    private static final String TAG_IMG = "IMG";

    /** The non breaking space character (code point 160). */
    private static final char NBSP = '\u00A0';

    /** The line break sequence written to the output. */
    private static final String LINE_BREAK = "\r\n";

    /** Indicates whether the next line breaks are appended (<code>true</code>) or only stored. */
    private boolean m_appendBr;

    /** Map of stored attributes that must be written to the output when the tag closes. */
    private final Map<Tag, String> m_attributeMap;

    /** The last appended line break count. */
    private int m_brCount;

    /** The current indentation. */
    private int m_indent;

    /** The current line length. */
    private int m_lineLength;

    /** The marker String (for headlines, bullets etc.). */
    private String m_marker;

    /** The maximum line length. */
    private int m_maxLineLength;

    /** The last stored, but not appended line break count. */
    private int m_storedBrCount;

    /** Flag indicating whether we are in an ignore text tag. */
    private boolean m_ignoreText;

    /**
     * Creates a new instance of the html converter.<p>
     */
    public CmsHtml2TextConverter() {

        // m_result is not declared in this class, it is inherited from the parser base class
        m_result = new StringBuffer(512);
        m_maxLineLength = 100;
        m_attributeMap = new HashMap<>(16);
    }

    /**
     * Extracts the text from the given html content, assuming the given html encoding.<p>
     *
     * @param html the content to extract the plain text from
     * @param encoding the encoding to use
     *
     * @return the text extracted from the given html content
     *
     * @throws Exception if something goes wrong
     */
    public static String html2text(String html, String encoding) throws Exception {

        // a fresh converter per call, since the converter keeps per-document state
        CmsHtml2TextConverter visitor = new CmsHtml2TextConverter();
        return visitor.process(html, encoding);
    }

    /**
     * Handles a closing tag: line breaks are only stored (not written), and attribute text
     * that was stored for the tag when it was opened is appended now.<p>
     *
     * @see org.htmlparser.visitors.NodeVisitor#visitEndTag(org.htmlparser.Tag)
     */
    @Override
    public void visitEndTag(Tag tag) {

        m_appendBr = false;
        m_ignoreText = false;
        appendLinebreaks(tag, false);
        // attributes were stored under the opening tag; the lookup uses the end tag's parent
        // NOTE(review): presumably the parent of an end tag is its start tag - confirm against htmlparser
        String attribute = m_attributeMap.remove(tag.getParent());
        if (attribute != null) {
            appendText(attribute);
        }
    }

    /**
     * Appends the text of the node, unless the text belongs to an ignored tag.<p>
     *
     * @see org.htmlparser.visitors.NodeVisitor#visitStringNode(org.htmlparser.Text)
     */
    @Override
    public void visitStringNode(Text text) {

        if (!m_ignoreText) {
            appendText(text.toPlainTextString());
        }
        m_ignoreText = false;
    }

    /**
     * Handles an opening tag: writes the line breaks / markers for the tag and collects the
     * attribute values that are part of the text output.<p>
     *
     * @see org.htmlparser.visitors.NodeVisitor#visitTag(org.htmlparser.Tag)
     */
    @Override
    public void visitTag(Tag tag) {

        m_appendBr = true;
        appendLinebreaks(tag, true);
        String name = tag.getTagName();
        // null check required: contains(null) throws a NullPointerException on List.of(...) lists
        m_ignoreText = (name != null) && IGNORE_TEXT.contains(name);
        if (TAG_IMG.equals(name)) {
            appendText("##IMG##");
        }
        appendAttributeValue(tag, "href", " [", "]");
        appendAttributeValue(tag, "src", " [", "]");
        appendAttributeValue(tag, "title", " {", "}");
        appendAttributeValue(tag, "alt", " {", "}");
    }

    /**
     * Appends an attribute.<p>
     *
     * For image tags the text is written immediately, for all other tags it is stored
     * (concatenated with text already stored for the same tag) until the tag closes.<p>
     *
     * @param tag the tag
     * @param text the attribute text
     */
    private void appendAttribute(Tag tag, String text) {

        if (TAG_IMG.equals(tag.getTagName())) {
            appendText(text);
            return;
        }
        String current = m_attributeMap.get(tag);
        m_attributeMap.put(tag, (current != null) ? (current + text) : text);
    }

    /**
     * Appends the trimmed value of the named attribute, wrapped in the given prefix and suffix,
     * if the attribute is present and not empty.<p>
     *
     * @param tag the tag to read the attribute from
     * @param attributeName the name of the attribute
     * @param prefix the text to place before the value
     * @param suffix the text to place after the value
     */
    private void appendAttributeValue(Tag tag, String attributeName, String prefix, String suffix) {

        String value = tag.getAttribute(attributeName);
        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(value)) {
            appendAttribute(tag, prefix + value.trim() + suffix);
        }
    }

    /**
     * Appends an indentation.<p>
     *
     * Nothing is written if text was already appended to the current line. A pending marker is
     * written right aligned inside the indentation, followed by a blank, and then cleared.<p>
     */
    private void appendIndentation() {

        if (m_lineLength > m_indent) {
            return;
        }
        if (m_marker == null) {
            appendBlanks(m_indent);
            return;
        }
        // reserve room for the marker and the blank that follows it
        appendBlanks(m_indent - (m_marker.length() + 1));
        m_result.append(m_marker);
        m_result.append(' ');
        m_marker = null;
    }

    /**
     * Appends the given number of blanks to the result, nothing for counts below one.<p>
     *
     * @param count the number of blanks
     */
    private void appendBlanks(int count) {

        for (int i = 0; i < count; i++) {
            m_result.append(' ');
        }
    }

    /**
     * Appends a line break.<p>
     *
     * @param count the number of lines
     */
    private void appendLinebreak(int count) {

        appendLinebreak(count, false);
    }

    /**
     * Appends line breaks.<p>
     *
     * If line breaks are currently not appended, the largest requested count is only stored.
     * Otherwise line breaks are written until the number of consecutive line breaks reaches
     * the requested (or the larger stored) count.<p>
     *
     * @param count the number of line breaks
     * @param force if the line break should be forced
     */
    private void appendLinebreak(int count, boolean force) {

        if (!m_appendBr) {
            // only remember the largest pending request
            m_storedBrCount = Math.max(m_storedBrCount, count);
            return;
        }
        int required = Math.max(count, m_storedBrCount);
        m_storedBrCount = 0;
        if (force) {
            // ignore line breaks already written, so the requested breaks are always added
            m_brCount = 0;
        }
        while (m_brCount < required) {
            m_result.append(LINE_BREAK);
            m_brCount++;
        }
        m_lineLength = m_indent;
    }

    /**
     * Appends line breaks.<p>
     *
     * The tag is identified by the position of its name in the inherited tag list, the
     * case comments name the tag expected at each position.<p>
     *
     * @param tag the tag
     * @param open the open flag
     */
    private void appendLinebreaks(Tag tag, boolean open) {

        String name = tag.getTagName();
        int pos = TAG_LIST.indexOf(name);

        switch (pos) {
            case 0: // H1
                setMarker("=", open);
                setIndentation(2, open);
                appendLinebreak(2);
                break;
            case 1: // H2
                setMarker("==", open);
                setIndentation(3, open);
                appendLinebreak(2);
                break;
            case 2: // H3
                setMarker("===", open);
                setIndentation(4, open);
                appendLinebreak(2);
                break;
            case 3: // H4
                setMarker("====", open);
                setIndentation(5, open);
                appendLinebreak(2);
                break;
            case 4: // H5
                setMarker("=====", open);
                setIndentation(6, open);
                appendLinebreak(2);
                break;
            case 5: // H6
                // six characters: the marker plus one blank must fit into the indentation of 7
                setMarker("======", open);
                setIndentation(7, open);
                appendLinebreak(2);
                break;
            case 6: // P
            case 7: // DIV
                appendLinebreak(2);
                break;
            case 8: // SPAN
                break;
            case 9: // BR
                appendLinebreak(1, true);
                break;
            case 10: // OL
            case 11: // UL
                appendLinebreak(2);
                break;
            case 12: // LI
                setMarker("*", open);
                setIndentation(5, open);
                appendLinebreak(1);
                break;
            case 13: // TABLE
                setIndentation(5, open);
                appendLinebreak(2);
                if (open) {
                    appendLinebreak(1);
                    appendText("-----");
                    appendLinebreak(1);
                }
                break;
            case 14: // TD
                setMarker("--", open);
                appendLinebreak(2);
                break;
            case 15: // TR
                if (!open) {
                    appendLinebreak(1);
                    appendText("-----");
                    appendLinebreak(1);
                }
                break;
            case 16: // TH
            case 17: // THEAD
            case 18: // TBODY
            case 19: // TFOOT
                appendLinebreak(1);
                break;
            default: // unknown tag (ignore)
        }
    }

    /**
     * Appends text.<p>
     *
     * Entities are decoded and white space is collapsed first. The text is then written word
     * by word, starting a new (indented) line whenever the next word would exceed the maximum
     * line length.<p>
     *
     * @param text the text
     */
    private void appendText(String text) {

        if (!CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) {
            return;
        }
        text = collapse(Translate.decode(text));
        if (!CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) {
            return;
        }

        if (m_storedBrCount > 0) {
            // write the line breaks that were stored by closing tags before the text
            m_appendBr = true;
            appendLinebreak(m_storedBrCount);
        }
        appendIndentation();
        m_brCount = 0;

        Iterator<String> i = CmsStringUtil.splitAsList(text, ' ').iterator();
        while (i.hasNext()) {
            String word = i.next();
            if (word.isEmpty()) {
                // defensive: the charAt calls below would fail for an empty token
                continue;
            }
            if ((word.length() + 1 + m_lineLength) > m_maxLineLength) {
                // the word does not fit on the current line
                m_appendBr = true;
                appendLinebreak(1);
                appendIndentation();
                m_brCount = 0;
            } else if (needsSeparator(word)) {
                m_result.append(' ');
                m_lineLength++;
            }
            m_result.append(word);
            m_lineLength += word.length();
        }
    }

    /**
     * Checks if a blank must be written before the given word is appended to the current line.<p>
     *
     * No blank is written if the word starts or ends with a non breaking space, if the line
     * contains no text yet, or if the result already ends with a blank or a non breaking space.<p>
     *
     * @param word the (non empty) word about to be appended
     *
     * @return <code>true</code> if a separating blank is required
     */
    private boolean needsSeparator(String word) {

        boolean hasNbsp = (word.charAt(0) == NBSP) || (word.charAt(word.length() - 1) == NBSP);
        if (hasNbsp || (m_lineLength <= m_indent)) {
            return false;
        }
        char last = m_result.charAt(m_result.length() - 1);
        return (last != NBSP) && (last != ' ');
    }

    /**
     * Sets the indentation.<p>
     *
     * The indentation never drops below zero.<p>
     *
     * @param length the indentation length
     * @param open if the indentation should be added or reduced
     */
    private void setIndentation(int length, boolean open) {

        if (open) {
            m_indent += length;
        } else {
            m_indent = Math.max(0, m_indent - length);
        }
    }

    /**
     * Sets the marker.<p>
     *
     * Closing tags leave the current marker unchanged.<p>
     *
     * @param marker the marker
     * @param open if the marker should be added
     */
    private void setMarker(String marker, boolean open) {

        if (open) {
            m_marker = marker;
        }
    }
}