001/* 002 * This library is part of OpenCms - 003 * the Open Source Content Management System 004 * 005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com) 006 * 007 * This library is free software; you can redistribute it and/or 008 * modify it under the terms of the GNU Lesser General Public 009 * License as published by the Free Software Foundation; either 010 * version 2.1 of the License, or (at your option) any later version. 011 * 012 * This library is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * Lesser General Public License for more details. 016 * 017 * For further information about Alkacon Software GmbH & Co. KG, please see the 018 * company website: http://www.alkacon.com 019 * 020 * For further information about OpenCms, please see the 021 * project website: http://www.opencms.org 022 * 023 * You should have received a copy of the GNU Lesser General Public 024 * License along with this library; if not, write to the Free Software 025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 026 */ 027 028package org.opencms.workplace.tools.database; 029 030import org.opencms.file.CmsPropertyDefinition; 031import org.opencms.i18n.CmsEncoder; 032import org.opencms.main.CmsLog; 033import org.opencms.util.CmsStringUtil; 034 035import java.io.ByteArrayInputStream; 036import java.io.ByteArrayOutputStream; 037import java.io.IOException; 038import java.io.InputStream; 039import java.io.PrintWriter; 040import java.io.Reader; 041import java.io.StringReader; 042import java.io.StringWriter; 043import java.io.UnsupportedEncodingException; 044import java.io.Writer; 045import java.util.HashSet; 046import java.util.Hashtable; 047import java.util.StringTokenizer; 048import java.util.regex.Matcher; 049import java.util.regex.Pattern; 050 051import org.w3c.dom.Document; 052import org.w3c.dom.NamedNodeMap; 053import org.w3c.dom.Node; 054import org.w3c.dom.NodeList; 055import org.w3c.tidy.Tidy; 056 057/** 058 * This class implements Html-converting routines based on tidy to modify the 059 * Html code of the imported Html pages.<p> 060 * 061 * @since 6.0.0 062 */ 063public class CmsHtmlImportConverter { 064 065 /** defintition of the alt attribute. */ 066 private static final String ATTRIB_ALT = "alt"; 067 068 /** defintition of the content attribute. */ 069 private static final String ATTRIB_CONTENT = "content"; 070 071 /** defintition of the href attribute. */ 072 private static final String ATTRIB_HREF = "href"; 073 074 /** defintition of the name attribute. */ 075 private static final String ATTRIB_NAME = "name"; 076 077 /** defintition of the src attribute. */ 078 private static final String ATTRIB_SRC = "src"; 079 080 /** defintition of the <BODY></BODY> node. */ 081 private static final String NODE_BODY = "body"; 082 083 /** defintition of the <HEAD></HEAD> node. */ 084 private static final String NODE_HEAD = "head"; 085 086 /** defintition of the <A></A> node. */ 087 private static final String NODE_HREF = "a"; 088 089 /** defintition of the <HTML></HTML> node. */ 090 private static final String NODE_HTML = "html"; 091 092 /** defintition of the <IMG></IMG> node. */ 093 private static final String NODE_IMG = "img"; 094 095 /** defintition of the <META></META> node. */ 096 private static final String NODE_META = "meta"; 097 098 /** defintition of the <TITLE></TITLE> node. */ 099 private static final String NODE_TITLE = "title"; 100 101 /** 102 * HashMap stores tag names, after the end-tag, a "\n" is added to the output.<p> 103 */ 104 private HashSet m_enterTags = new HashSet(); 105 106 /** 107 * the absolute path in the real filesystem of the file to convert. 108 */ 109 private String m_filename; 110 111 /** 112 * reference to the HtmlImport object, required to access the link translation. 113 */ 114 private CmsHtmlImport m_htmlImport; 115 116 /** 117 * temporary buffer used in transformation method. 118 */ 119 private StringBuffer m_tempString; 120 121 /** instance of JTidy. */ 122 private Tidy m_tidy = new Tidy(); 123 124 /** flag to write the output. */ 125 private boolean m_write; 126 127 /** 128 * Default constructor, creates a new HtmlConverter.<p> 129 * 130 * @param htmlImport reference to the htmlimport 131 * @param xmlMode switch for setting the import to HTML or XML mode 132 */ 133 public CmsHtmlImportConverter(CmsHtmlImport htmlImport, boolean xmlMode) { 134 135 m_tidy.setTidyMark(false); 136 m_tidy.setShowWarnings(false); 137 m_tidy.setQuiet(true); 138 m_tidy.setForceOutput(true); 139 140 if (xmlMode) { 141 m_tidy.setXmlTags(xmlMode); 142 m_tidy.setXmlSpace(true); 143 } 144 145 initialiseTags(); 146 m_htmlImport = htmlImport; 147 } 148 149 /** 150 * Extracts the content of a HTML page.<p> 151 * 152 * This method should be pretty robust and work even if the input HTML does not contains 153 * the specified matchers.<p> 154 * 155 * @param content the content to extract the body from 156 * @param startpoint the point where matching starts 157 * @param endpoint the point where matching ends 158 * @return the extracted body tag content 159 */ 160 public static String extractHtml(String content, String startpoint, String endpoint) { 161 162 /** Regex that matches a start body tag. */ 163 Pattern startPattern = Pattern.compile(startpoint, Pattern.CASE_INSENSITIVE); 164 165 /** Regex that matches an end body tag. */ 166 Pattern endPattern = Pattern.compile(endpoint, Pattern.CASE_INSENSITIVE); 167 168 Matcher startMatcher = startPattern.matcher(content); 169 Matcher endMatcher = endPattern.matcher(content); 170 171 int start = 0; 172 int end = content.length(); 173 174 if (startMatcher.find()) { 175 start = startMatcher.end(); 176 } 177 178 if (endMatcher.find(start)) { 179 end = endMatcher.start(); 180 } 181 182 return content.substring(start, end); 183 } 184 185 /** 186 * Transforms HTML code into user defined output.<p> 187 * 188 * @param input Reader with HTML code 189 * @param output Writer with transformed code 190 * @param startPattern the start pattern definition for content extracting 191 * @param endPattern the end pattern definition for content extracting 192 * @param properties the file properties 193 */ 194 public void convertHTML(Reader input, Writer output, String startPattern, String endPattern, Hashtable properties) { 195 196 /* local variables */ 197 StringBuffer htmlString = new StringBuffer(); 198 Node node; 199 String outString = ""; 200 201 try { 202 /* write InputStream input in StringBuffer htmlString */ 203 int c; 204 while ((c = input.read()) != -1) { 205 htmlString.append((char)c); 206 } 207 } catch (IOException e) { 208 if (CmsLog.INIT.isWarnEnabled()) { 209 CmsLog.INIT.warn( 210 Messages.get().getBundle().key( 211 Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0, 212 e.getLocalizedMessage())); 213 } 214 return; 215 } 216 outString = htmlString.toString(); 217 // extract from html if even both patterns are defined 218 if (CmsStringUtil.isNotEmpty(startPattern) && CmsStringUtil.isNotEmpty(endPattern)) { 219 String extractMain = extractHtml(outString, startPattern, endPattern); 220 if (extractMain.length() != outString.length()) { 221 String extractHead = extractHtml(outString, "<html>", CmsStringUtil.BODY_START_REGEX); 222 //String extractHead = extractHtml(extractMain, "<html>", CmsStringUtil.C_BODY_START_REGEX); 223 StringBuffer buffer = new StringBuffer(extractHead.length() + extractMain.length() + 255); 224 buffer.append("<html>"); 225 buffer.append(extractHead); 226 buffer.append("<body>"); 227 buffer.append(extractMain); 228 buffer.append("</body></html>"); 229 outString = buffer.toString(); 230 } 231 } 232 233 /* convert htmlString in InputStream for parseDOM */ 234 InputStream in; 235 try { 236 in = new ByteArrayInputStream(outString.getBytes(CmsEncoder.ENCODING_UTF_8)); 237 } catch (UnsupportedEncodingException e) { 238 // this should never happen since UTF-8 is always supported 239 in = new ByteArrayInputStream(outString.getBytes()); 240 } 241 m_tidy.setInputEncoding(CmsEncoder.ENCODING_UTF_8); 242 m_tidy.setOutputEncoding(CmsEncoder.ENCODING_UTF_8); 243 244 // hold tidy error information into a new PrintWriter Object 245 PrintWriter errorLog = new PrintWriter(new ByteArrayOutputStream(), true); 246 m_tidy.setErrout(errorLog); 247 248 node = m_tidy.parseDOM(in, null); 249 /* check if html code has errors */ 250 if (m_tidy.getParseErrors() != 0) { 251 if (CmsLog.INIT.isWarnEnabled()) { 252 CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0)); 253 } 254 } 255 /* second step: create transformed output with printDocument from DOM */ 256 printDocument(node, properties); 257 258 try { 259 String content = m_tempString.toString(); 260 content = CmsStringUtil.substitute(content, "<br></br>", "<br>"); 261 content = CmsStringUtil.substitutePerl(content, "</a>(\\w+)", "</a> $1", "g"); 262 output.write(content); 263 output.close(); 264 265 } catch (IOException e) { 266 if (CmsLog.INIT.isWarnEnabled()) { 267 CmsLog.INIT.warn( 268 Messages.get().getBundle().key( 269 Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_1, 270 e.getLocalizedMessage())); 271 } 272 return; 273 } 274 } 275 276 /** 277 * Transforms HTML code into user defined output.<p> 278 * 279 * @param filename the absolute path in the real filesystem of the file to convert 280 * @param inString String with HTML code 281 * @param startPattern the start pattern definition for content extracting 282 * @param endPattern the end pattern definition for content extracting 283 * @param properties the file properties 284 * @return String with transformed code 285 */ 286 public String convertHTML( 287 String filename, 288 String inString, 289 String startPattern, 290 String endPattern, 291 Hashtable properties) { 292 293 m_tempString = new StringBuffer(); 294 m_write = true; 295 m_filename = filename.replace('\\', '/'); 296 Reader in = new StringReader(inString); 297 Writer out = new StringWriter(); 298 convertHTML(in, out, startPattern, endPattern, properties); 299 return out.toString(); 300 } 301 302 /** 303 * Initialises Vector m_enterTags with tag names.<p> 304 */ 305 private void initialiseTags() { 306 307 StringTokenizer T = new StringTokenizer( 308 "p,table,tr,td,body,head,script,pre,title,style,h1,h2,h3,h4,h5,h6,ul,ol,li", 309 ","); 310 while (T.hasMoreTokens()) { 311 m_enterTags.add(T.nextToken()); 312 } 313 } 314 315 /** 316 * Private method to parse DOM and create user defined output.<p> 317 * 318 * @param node Node of DOM from HTML code 319 * @param properties the file properties 320 */ 321 private void printDocument(Node node, Hashtable properties) { 322 323 // if node is empty do nothing... (Recursion) 324 if (node == null) { 325 return; 326 } 327 // initialise local variables 328 int type = node.getNodeType(); 329 String name = node.getNodeName(); 330 331 // detect node type 332 switch (type) { 333 case Node.DOCUMENT_NODE: 334 335 printDocument(((Document)node).getDocumentElement(), properties); 336 break; 337 case Node.ELEMENT_NODE: 338 339 // check if its the <head> node. Nothing inside the <head> node 340 // must be 341 // part of the output, but we must scan the content of this 342 // node to get all 343 // <meta> tags 344 if (name.equals(NODE_HEAD)) { 345 m_write = false; 346 } 347 // scan element node; if a block has to be removed or replaced, 348 // break and discard child nodes 349 transformStartElement(node, properties); 350 351 // test if node has children 352 NodeList children = node.getChildNodes(); 353 if (children != null) { 354 int len = children.getLength(); 355 for (int i = 0; i < len; i++) { 356 // recursively call printDocument with all child nodes 357 printDocument(children.item(i), properties); 358 } 359 } 360 break; 361 case Node.TEXT_NODE: 362 363 // replace subStrings in text nodes 364 transformTextNode(node); 365 break; 366 default: 367 368 break; 369 } 370 // end of recursion, add eventual endtags and suffixes 371 switch (type) { 372 case Node.ELEMENT_NODE: 373 // analyse endtags and add them to output 374 transformEndElement(node); 375 if (node.getNodeName().equals(NODE_HEAD)) { 376 m_write = true; 377 } 378 break; 379 case Node.DOCUMENT_NODE: 380 break; 381 default: 382 break; 383 } 384 } 385 386 /** 387 * Transform element nodes and create end tags in output.<p> 388 * 389 * @param node actual element node 390 */ 391 private void transformEndElement(Node node) { 392 393 // check hat kind of node we have 394 String nodeName = node.getNodeName(); 395 396 // the <HTML> and <BODY> node must be skipped 397 if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) { 398 // do nothing here 399 } else { 400 // only do some output if we are in writing mode 401 if (m_write) { 402 m_tempString.append("</"); 403 m_tempString.append(nodeName); 404 m_tempString.append(">"); 405 406 // append a "\n" to output String if possible 407 if (m_enterTags.contains(node.getNodeName())) { 408 m_tempString.append("\n"); 409 } 410 } 411 } 412 } 413 414 /** 415 * Transforms element nodes and create start tags in output. <p> 416 * 417 * @param node actual element node 418 * @param properties the file properties 419 */ 420 private void transformStartElement(Node node, Hashtable properties) { 421 422 // check hat kind of node we have 423 String nodeName = node.getNodeName(); 424 425 // the <HTML> and <BODY> node must be skipped 426 if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) { 427 // the <TITLE> node must be read and its value set as properties to 428 // the imported file 429 430 } else if (nodeName.equals(NODE_TITLE)) { 431 432 writeTitleProperty(node, properties); 433 434 } else if (nodeName.equals(NODE_META)) { 435 436 writeMetaTagProperty(node, properties); 437 438 } else if (nodeName.equals(NODE_HREF)) { 439 440 // only do some output if we are in writing mode 441 if (m_write) { 442 m_tempString.append("<"); 443 m_tempString.append(nodeName); 444 NamedNodeMap attrs = node.getAttributes(); 445 // look through all attribs to find the reference 446 for (int i = attrs.getLength() - 1; i >= 0; i--) { 447 String name = attrs.item(i).getNodeName(); 448 String value = attrs.item(i).getNodeValue(); 449 450 if (name.equals(ATTRIB_HREF)) { 451 452 // check if this is an external link 453 if (value.indexOf("://") > 0) { 454 // store it for later creation of an entry in the 455 // link gallery 456 String externalLinkFile = m_htmlImport.storeExternalLink(value); 457 if (externalLinkFile != null) { 458 value = m_htmlImport.getLinkGallery() + externalLinkFile; 459 } 460 } else if (!value.startsWith("mailto:") && !value.startsWith("javascript:")) { 461 462 // save an existing anchor link for later use 463 // if (value.indexOf("#") > 0) { 464 // String anchor = value.substring(value.indexOf("#"), value.length()); 465 // } 466 // get the new link into the VFS 467 String internalUri = m_htmlImport.getAbsoluteUri( 468 value, 469 m_filename.substring(0, m_filename.lastIndexOf("/") + 1)); 470 471 value = m_htmlImport.translateLink(internalUri); 472 } 473 } 474 475 m_tempString.append(" "); 476 m_tempString.append(name); 477 m_tempString.append("=\""); 478 m_tempString.append(value); 479 m_tempString.append("\""); 480 } 481 m_tempString.append(">"); 482 } 483 484 // this is a imasge, its reference must be converted 485 } else if (nodeName.equals(NODE_IMG)) { 486 487 // only do some output if we are in writing mode 488 if (m_write) { 489 m_tempString.append("<"); 490 m_tempString.append(nodeName); 491 NamedNodeMap attrs = node.getAttributes(); 492 // look through all attribs to find the src and alt attributes 493 String imagename = ""; 494 String altText = ""; 495 for (int i = attrs.getLength() - 1; i >= 0; i--) { 496 String name = attrs.item(i).getNodeName(); 497 String value = attrs.item(i).getNodeValue(); 498 if (name.equals(ATTRIB_SRC)) { 499 // we found the src. now check if it refers to an 500 // external image. 501 // if not, we must get the correct location in the VFS 502 if (value.indexOf("://") <= 0) { 503 imagename = m_htmlImport.getAbsoluteUri( 504 value, 505 m_filename.substring(0, m_filename.lastIndexOf("/") + 1)); 506 value = m_htmlImport.translateLink(imagename); 507 } 508 } else if (name.equals(ATTRIB_ALT)) { 509 altText = value; 510 } 511 512 m_tempString.append(" "); 513 m_tempString.append(name); 514 m_tempString.append("=\""); 515 m_tempString.append(value); 516 m_tempString.append("\""); 517 } 518 519 //store the alt tag of this image for later use 520 m_htmlImport.storeImageInfo(imagename, altText); 521 522 m_tempString.append(">"); 523 } 524 } else { 525 526 // only do some output if we are in writing mode 527 if (m_write) { 528 529 m_tempString.append("<"); 530 m_tempString.append(nodeName); 531 NamedNodeMap attrs = node.getAttributes(); 532 for (int i = attrs.getLength() - 1; i >= 0; i--) { 533 m_tempString.append(" " + attrs.item(i).getNodeName() + "=" + "\""); 534 /* scan attribute values and replace subStrings */ 535 m_tempString.append(attrs.item(i).getNodeValue() + "\""); 536 } 537 m_tempString.append(">"); 538 } 539 } 540 } 541 542 /** 543 * Private method to transform text nodes.<p> 544 * 545 * @param node actual text node 546 */ 547 private void transformTextNode(Node node) { 548 549 // only do some output if we are in writing mode 550 if (m_write) { 551 String helpString = node.getNodeValue(); 552 m_tempString.append(helpString); 553 } 554 } 555 556 /** 557 * Writes meta tags as cms properties by analyzing the meta tags nodes.<p> 558 * 559 * @param node the meta tag node in html document 560 * @param properties the properties hashtable 561 */ 562 private void writeMetaTagProperty(Node node, Hashtable properties) { 563 564 NamedNodeMap attrs = node.getAttributes(); 565 String metaName = ""; 566 String metaContent = ""; 567 // look through all attribs to find the name and content attributes 568 for (int i = attrs.getLength() - 1; i >= 0; i--) { 569 String name = attrs.item(i).getNodeName(); 570 String value = attrs.item(i).getNodeValue(); 571 if (name.equals(ATTRIB_NAME)) { 572 metaName = value; 573 } else if (name.equals(ATTRIB_CONTENT)) { 574 metaContent = value; 575 } 576 } 577 // check if we have valid entries for this <META> node, store them 578 // in the properties 579 if ((metaName.length() > 0) && (metaContent.length() > 0)) { 580 properties.put(metaName, CmsStringUtil.substitute(metaContent, "{subst}", "&#")); 581 } 582 } 583 584 /** 585 * Sets the Property title by analyzing the title node.<p> 586 * 587 * @param node the title node in html document 588 * @param properties the properties hashtable 589 */ 590 private void writeTitleProperty(Node node, Hashtable properties) { 591 592 String title = ""; 593 // the title string is stored in the first child node 594 NodeList children = node.getChildNodes(); 595 if (children != null) { 596 Node titleNode = children.item(0); 597 if (titleNode != null) { 598 title = titleNode.getNodeValue(); 599 } 600 } 601 // add the title property if we have one 602 if ((title != null) && (title.length() > 0)) { 603 604 properties.put(CmsPropertyDefinition.PROPERTY_TITLE, CmsStringUtil.substitute(title, "{subst}", "&#")); 605 // the title will be used as navtext if no other navtext is 606 // given 607 if (properties.get(CmsPropertyDefinition.PROPERTY_NAVTEXT) == null) { 608 properties.put( 609 CmsPropertyDefinition.PROPERTY_NAVTEXT, 610 CmsStringUtil.substitute(title, "{subst}", "&#")); 611 } 612 } 613 614 } 615 616}