001 002package org.opencms.util; 003 004import java.util.HashMap; 005import java.util.Iterator; 006import java.util.List; 007import java.util.Map; 008 009import org.htmlparser.Tag; 010import org.htmlparser.Text; 011import org.htmlparser.util.Translate; 012 013/** 014 * Extracts the HTML page content.<p> 015 */ 016public class CmsHtml2TextConverter extends CmsHtmlParser { 017 018 /** Indicated to append or store the next line breaks. */ 019 private boolean m_appendBr; 020 021 /** Map of stored attributes that must be written to the output when the tag closes. */ 022 private Map<Tag, String> m_attributeMap; 023 024 /** The last appended line break count. */ 025 private int m_brCount; 026 027 /** The current indentation. */ 028 private int m_indent; 029 030 /** The current line length. */ 031 private int m_lineLength; 032 033 /** The marker String (for headlines, bullets etc.). */ 034 private String m_marker; 035 036 /** The maximum line length. */ 037 private int m_maxLineLength; 038 039 /** The last stored, but not appended line break count. */ 040 private int m_storedBrCount; 041 042 /** 043 * Creates a new instance of the html converter.<p> 044 */ 045 public CmsHtml2TextConverter() { 046 047 m_result = new StringBuffer(512); 048 m_maxLineLength = 100; 049 m_attributeMap = new HashMap<Tag, String>(16); 050 } 051 052 /** 053 * Extracts the text from the given html content, assuming the given html encoding.<p> 054 * 055 * @param html the content to extract the plain text from 056 * @param encoding the encoding to use 057 * 058 * @return the text extracted from the given html content 059 * 060 * @throws Exception if something goes wrong 061 */ 062 public static String html2text(String html, String encoding) throws Exception { 063 064 // create the converter instance 065 CmsHtml2TextConverter visitor = new CmsHtml2TextConverter(); 066 return visitor.process(html, encoding); 067 } 068 069 /** 070 * @see org.htmlparser.visitors.NodeVisitor#visitEndTag(org.htmlparser.Tag) 071 */ 072 @Override 073 public void visitEndTag(Tag tag) { 074 075 m_appendBr = false; 076 appendLinebreaks(tag, false); 077 String attribute = m_attributeMap.remove(tag.getParent()); 078 if (attribute != null) { 079 appendText(attribute); 080 } 081 } 082 083 /** 084 * @see org.htmlparser.visitors.NodeVisitor#visitStringNode(org.htmlparser.Text) 085 */ 086 @Override 087 public void visitStringNode(Text text) { 088 089 appendText(text.toPlainTextString()); 090 } 091 092 /** 093 * @see org.htmlparser.visitors.NodeVisitor#visitTag(org.htmlparser.Tag) 094 */ 095 @Override 096 public void visitTag(Tag tag) { 097 098 m_appendBr = true; 099 appendLinebreaks(tag, true); 100 101 if (tag.getTagName().equals("IMG")) { 102 appendText("##IMG##"); 103 } 104 105 String href = tag.getAttribute("href"); 106 if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(href)) { 107 appendAttribute(tag, " [" + href.trim() + "]"); 108 } 109 String src = tag.getAttribute("src"); 110 if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(src)) { 111 appendAttribute(tag, " [" + src.trim() + "]"); 112 } 113 String title = tag.getAttribute("title"); 114 if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(title)) { 115 appendAttribute(tag, " {" + title.trim() + "}"); 116 } 117 String alt = tag.getAttribute("alt"); 118 if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(alt)) { 119 appendAttribute(tag, " {" + alt.trim() + "}"); 120 } 121 } 122 123 /** 124 * Appends an attribute.<p> 125 * 126 * @param tag the tag 127 * @param text the attribute text 128 */ 129 private void appendAttribute(Tag tag, String text) { 130 131 if (tag.getTagName().equals("IMG")) { 132 appendText(text); 133 } else { 134 String current = m_attributeMap.get(tag); 135 if (current != null) { 136 text = current + text; 137 } 138 m_attributeMap.put(tag, text); 139 } 140 } 141 142 /** 143 * Appends an indentation.<p> 144 */ 145 private void appendIndentation() { 146 147 if (m_lineLength <= m_indent) { 148 int len = (m_marker != null) ? m_indent - (m_marker.length() + 1) : m_indent; 149 for (int i = 0; i < len; i++) { 150 m_result.append(' '); 151 } 152 if (m_marker != null) { 153 m_result.append(m_marker); 154 m_result.append(' '); 155 m_marker = null; 156 } 157 } 158 } 159 160 /** 161 * Appends a line break.<p> 162 * 163 * @param count the number of lines 164 */ 165 private void appendLinebreak(int count) { 166 167 appendLinebreak(count, false); 168 } 169 170 /** 171 * Appends line breaks.<p> 172 * 173 * @param count the number of line breaks 174 * @param force if the line break should be forced 175 */ 176 private void appendLinebreak(int count, boolean force) { 177 178 if (m_appendBr) { 179 if (m_storedBrCount > count) { 180 count = m_storedBrCount; 181 } 182 m_storedBrCount = 0; 183 if (force) { 184 m_brCount = 0; 185 } 186 while (m_brCount < count) { 187 m_result.append("\r\n"); 188 m_brCount++; 189 } 190 m_lineLength = m_indent; 191 } else { 192 while (m_storedBrCount < count) { 193 m_storedBrCount++; 194 } 195 } 196 } 197 198 /** 199 * Appends line breaks.<p> 200 * 201 * @param tag the tag 202 * @param open the open flag 203 */ 204 private void appendLinebreaks(Tag tag, boolean open) { 205 206 String name = tag.getTagName(); 207 int pos = TAG_LIST.indexOf(name); 208 209 switch (pos) { 210 case 0: // H1 211 setMarker("=", open); 212 setIndentation(2, open); 213 appendLinebreak(2); 214 break; 215 case 1: // H2 216 setMarker("==", open); 217 setIndentation(3, open); 218 appendLinebreak(2); 219 break; 220 case 2: // H3 221 setMarker("===", open); 222 setIndentation(4, open); 223 appendLinebreak(2); 224 break; 225 case 3: // H4 226 setMarker("====", open); 227 setIndentation(5, open); 228 appendLinebreak(2); 229 break; 230 case 4: // H5 231 setMarker("=====", open); 232 setIndentation(6, open); 233 appendLinebreak(2); 234 break; 235 case 5: // H6 236 setMarker("=======", open); 237 setIndentation(7, open); 238 appendLinebreak(2); 239 break; 240 case 6: // P 241 case 7: // DIV 242 appendLinebreak(2); 243 break; 244 case 8: // SPAN 245 break; 246 case 9: // BR 247 appendLinebreak(1, true); 248 break; 249 case 10: // OL 250 case 11: // UL 251 appendLinebreak(2); 252 break; 253 case 12: // LI 254 setMarker("*", open); 255 setIndentation(5, open); 256 appendLinebreak(1); 257 break; 258 case 13: // TABLE 259 setIndentation(5, open); 260 appendLinebreak(2); 261 if (open) { 262 appendLinebreak(1); 263 appendText("-----"); 264 appendLinebreak(1); 265 } 266 break; 267 case 14: // TD 268 setMarker("--", open); 269 appendLinebreak(2); 270 break; 271 case 15: // TR 272 if (!open) { 273 appendLinebreak(1); 274 appendText("-----"); 275 appendLinebreak(1); 276 } 277 break; 278 case 16: // TH 279 case 17: // THEAD 280 case 18: // TBODY 281 case 19: // TFOOT 282 appendLinebreak(1); 283 break; 284 default: // unknown tag (ignore) 285 } 286 } 287 288 /** 289 * Appends text.<p> 290 * 291 * @param text the text 292 */ 293 private void appendText(String text) { 294 295 if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) { 296 text = Translate.decode(text); 297 text = collapse(text); 298 } 299 if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) { 300 301 if (m_storedBrCount > 0) { 302 m_appendBr = true; 303 appendLinebreak(m_storedBrCount); 304 } 305 appendIndentation(); 306 m_brCount = 0; 307 308 List<String> wordList = CmsStringUtil.splitAsList(text, ' '); 309 Iterator<String> i = wordList.iterator(); 310 while (i.hasNext()) { 311 String word = i.next(); 312 boolean hasNbsp = ((word.charAt(0) == 160) || (word.charAt(word.length() - 1) == 160)); 313 if ((word.length() + 1 + m_lineLength) > m_maxLineLength) { 314 m_appendBr = true; 315 appendLinebreak(1); 316 appendIndentation(); 317 m_brCount = 0; 318 } else { 319 if (!hasNbsp 320 && (m_lineLength > m_indent) 321 && (m_result.charAt(m_result.length() - 1) != 160) 322 && (m_result.charAt(m_result.length() - 1) != 32)) { 323 324 m_result.append(' '); 325 m_lineLength++; 326 } 327 } 328 m_result.append(word); 329 m_lineLength += word.length(); 330 } 331 } 332 } 333 334 /** 335 * Sets the indentation.<p> 336 * 337 * @param length the indentation length 338 * @param open if the indentation should be added or reduced 339 */ 340 private void setIndentation(int length, boolean open) { 341 342 if (open) { 343 m_indent += length; 344 } else { 345 m_indent -= length; 346 if (m_indent < 0) { 347 m_indent = 0; 348 } 349 } 350 } 351 352 /** 353 * Sets the marker.<p> 354 * 355 * @param marker the marker 356 * @param open if the marker should be added 357 */ 358 private void setMarker(String marker, boolean open) { 359 360 if (open) { 361 m_marker = marker; 362 } 363 } 364}