001/* 002 * This library is part of OpenCms - 003 * the Open Source Content Management System 004 * 005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com) 006 * 007 * This library is free software; you can redistribute it and/or 008 * modify it under the terms of the GNU Lesser General Public 009 * License as published by the Free Software Foundation; either 010 * version 2.1 of the License, or (at your option) any later version. 011 * 012 * This library is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * Lesser General Public License for more details. 016 * 017 * For further information about Alkacon Software GmbH & Co. KG, please see the 018 * company website: http://www.alkacon.com 019 * 020 * For further information about OpenCms, please see the 021 * project website: http://www.opencms.org 022 * 023 * You should have received a copy of the GNU Lesser General Public 024 * License along with this library; if not, write to the Free Software 025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 026 */ 027 028package org.opencms.staticexport; 029 030import org.opencms.file.CmsObject; 031import org.opencms.file.CmsPropertyDefinition; 032import org.opencms.file.wrapper.CmsObjectWrapper; 033import org.opencms.gwt.shared.CmsGwtConstants; 034import org.opencms.i18n.CmsEncoder; 035import org.opencms.main.CmsException; 036import org.opencms.main.OpenCms; 037import org.opencms.relations.CmsLink; 038import org.opencms.relations.CmsRelationType; 039import org.opencms.util.CmsHtmlParser; 040import org.opencms.util.CmsMacroResolver; 041import org.opencms.util.CmsRequestUtil; 042import org.opencms.util.CmsStringUtil; 043 044import java.util.Vector; 045 046import org.htmlparser.Attribute; 047import org.htmlparser.Node; 048import org.htmlparser.Tag; 049import org.htmlparser.tags.ImageTag; 050import org.htmlparser.tags.LinkTag; 051import org.htmlparser.tags.ObjectTag; 052import org.htmlparser.util.ParserException; 053import org.htmlparser.util.SimpleNodeIterator; 054 055/** 056 * Implements the HTML parser node visitor pattern to 057 * exchange all links on the page.<p> 058 * 059 * @since 6.0.0 060 */ 061public class CmsLinkProcessor extends CmsHtmlParser { 062 063 /** Constant for the attribute name. */ 064 public static final String ATTRIBUTE_HREF = "href"; 065 066 /** Constant for the attribute name. */ 067 public static final String ATTRIBUTE_SRC = "src"; 068 069 /** Constant for the attribute name. */ 070 public static final String ATTRIBUTE_VALUE = "value"; 071 072 /** HTML end. */ 073 public static final String HTML_END = "</body></html>"; 074 075 /** HTML start. */ 076 public static final String HTML_START = "<html><body>"; 077 078 /** Constant for the tag name. */ 079 public static final String TAG_AREA = "AREA"; 080 081 /** Constant for the tag name. */ 082 public static final String TAG_EMBED = "EMBED"; 083 084 /** Constant for the tag name. */ 085 public static final String TAG_IFRAME = "IFRAME"; 086 087 /** Constant for the tag name. */ 088 public static final String TAG_PARAM = "PARAM"; 089 090 /** List of attributes that may contain links for the embed tag. */ 091 private static final String[] EMBED_TAG_LINKED_ATTRIBS = new String[] {ATTRIBUTE_SRC, "pluginurl", "pluginspage"}; 092 093 /** List of attributes that may contain links for the object tag ("codebase" has to be first). */ 094 private static final String[] OBJECT_TAG_LINKED_ATTRIBS = new String[] {"codebase", "data", "datasrc"}; 095 096 /** Processing mode "process links" (macros to links). */ 097 private static final int PROCESS_LINKS = 1; 098 099 /** Processing mode "replace links" (links to macros). */ 100 private static final int REPLACE_LINKS = 0; 101 102 /** The current users OpenCms context, containing the users permission and site root context. */ 103 private CmsObject m_cms; 104 105 /** The selected encoding to use for parsing the HTML. */ 106 private String m_encoding; 107 108 /** The link table used for link macro replacements. */ 109 private CmsLinkTable m_linkTable; 110 111 /** Current processing mode. */ 112 private int m_mode; 113 114 /** The relative path for relative links, if not set, relative links are treated as external links. */ 115 private String m_relativePath; 116 117 /** Another OpenCms context based on the current users OpenCms context, but with the site root set to '/'. */ 118 private CmsObject m_rootCms; 119 120 /** 121 * Creates a new link processor.<p> 122 * 123 * @param cms the current users OpenCms context 124 * @param linkTable the link table to use 125 * @param encoding the encoding to use for parsing the HTML content 126 * @param relativePath additional path for links with relative path (only used in "replace" mode) 127 */ 128 public CmsLinkProcessor(CmsObject cms, CmsLinkTable linkTable, String encoding, String relativePath) { 129 130 // echo mode must be on for link processor 131 super(true); 132 133 m_cms = cms; 134 if (m_cms != null) { 135 try { 136 m_rootCms = OpenCms.initCmsObject(cms); 137 m_rootCms.getRequestContext().setSiteRoot("/"); 138 } catch (CmsException e) { 139 // this should not happen 140 m_rootCms = null; 141 } 142 } 143 m_linkTable = linkTable; 144 m_encoding = encoding; 145 m_relativePath = relativePath; 146 } 147 148 /** 149 * Escapes all <code>&</code>, e.g. replaces them with a <code>&</code>.<p> 150 * 151 * @param source the String to escape 152 * @return the escaped String 153 */ 154 public static String escapeLink(String source) { 155 156 if (source == null) { 157 return null; 158 } 159 StringBuffer result = new StringBuffer(source.length() * 2); 160 int terminatorIndex; 161 for (int i = 0; i < source.length(); ++i) { 162 char ch = source.charAt(i); 163 switch (ch) { 164 case '&': 165 // don't escape already escaped &s; 166 terminatorIndex = source.indexOf(';', i); 167 if (terminatorIndex > 0) { 168 String substr = source.substring(i + 1, terminatorIndex); 169 if ("amp".equals(substr)) { 170 result.append(ch); 171 } else { 172 result.append("&"); 173 } 174 } else { 175 result.append("&"); 176 } 177 break; 178 default: 179 result.append(ch); 180 } 181 } 182 return new String(result); 183 } 184 185 /** 186 * Unescapes all <code>&amp;</code>, that is replaces them with a <code>&</code>.<p> 187 * 188 * @param source the String to unescape 189 * @return the unescaped String 190 */ 191 public static String unescapeLink(String source) { 192 193 if (source == null) { 194 return null; 195 } 196 return CmsStringUtil.substitute(source, "&", "&"); 197 198 } 199 200 /** 201 * Returns the link table this link processor was initialized with.<p> 202 * 203 * @return the link table this link processor was initialized with 204 */ 205 public CmsLinkTable getLinkTable() { 206 207 return m_linkTable; 208 } 209 210 /** 211 * Starts link processing for the given content in processing mode.<p> 212 * 213 * Macros are replaced by links.<p> 214 * 215 * @param content the content to process 216 * @return the processed content with replaced macros 217 * 218 * @throws ParserException if something goes wrong 219 */ 220 public String processLinks(String content) throws ParserException { 221 222 m_mode = PROCESS_LINKS; 223 return process(content, m_encoding); 224 } 225 226 /** 227 * Starts link processing for the given content in replacement mode.<p> 228 * 229 * Links are replaced by macros.<p> 230 * 231 * @param content the content to process 232 * @return the processed content with replaced links 233 * 234 * @throws ParserException if something goes wrong 235 */ 236 public String replaceLinks(String content) throws ParserException { 237 238 m_mode = REPLACE_LINKS; 239 return process(content, m_encoding); 240 } 241 242 /** 243 * Visitor method to process a tag (start).<p> 244 * 245 * @param tag the tag to process 246 */ 247 @Override 248 public void visitTag(Tag tag) { 249 250 if (tag instanceof LinkTag) { 251 processLinkTag((LinkTag)tag); 252 } else if (tag instanceof ImageTag) { 253 processImageTag((ImageTag)tag); 254 } else if (tag instanceof ObjectTag) { 255 processObjectTag((ObjectTag)tag); 256 } else { 257 // there are no specialized tag classes for these tags :( 258 if (TAG_EMBED.equals(tag.getTagName())) { 259 processEmbedTag(tag); 260 } else if (TAG_AREA.equals(tag.getTagName())) { 261 processAreaTag(tag); 262 } else if (TAG_IFRAME.equals(tag.getTagName())) { 263 String src = tag.getAttribute(ATTRIBUTE_SRC); 264 if ((src != null) && !src.startsWith("//")) { 265 // link processing does not work for protocol-relative URLs, which were once used in Youtube embed 266 // codes. 267 processLink(tag, ATTRIBUTE_SRC, CmsRelationType.HYPERLINK); 268 } 269 } 270 } 271 // append text content of the tag (may have been changed by above methods) 272 super.visitTag(tag); 273 } 274 275 /** 276 * Process an area tag.<p> 277 * 278 * @param tag the tag to process 279 */ 280 protected void processAreaTag(Tag tag) { 281 282 processLink(tag, ATTRIBUTE_HREF, CmsRelationType.HYPERLINK); 283 } 284 285 /** 286 * Process an embed tag.<p> 287 * 288 * @param tag the tag to process 289 */ 290 protected void processEmbedTag(Tag tag) { 291 292 for (int i = 0; i < EMBED_TAG_LINKED_ATTRIBS.length; i++) { 293 String attr = EMBED_TAG_LINKED_ATTRIBS[i]; 294 processLink(tag, attr, CmsRelationType.EMBEDDED_OBJECT); 295 } 296 } 297 298 /** 299 * Process an image tag.<p> 300 * 301 * @param tag the tag to process 302 */ 303 protected void processImageTag(ImageTag tag) { 304 305 processLink(tag, ATTRIBUTE_SRC, CmsRelationType.valueOf(tag.getTagName())); 306 } 307 308 /** 309 * Process a tag having a link in the given attribute, considering the link as the given type.<p> 310 * 311 * @param tag the tag to process 312 * @param attr the attribute 313 * @param type the link type 314 */ 315 protected void processLink(Tag tag, String attr, CmsRelationType type) { 316 317 if (tag.getAttribute(attr) == null) { 318 return; 319 } 320 CmsLink link = null; 321 322 switch (m_mode) { 323 case PROCESS_LINKS: 324 // macros are replaced with links 325 link = m_linkTable.getLink(CmsMacroResolver.stripMacro(tag.getAttribute(attr))); 326 if (link != null) { 327 // link management check 328 String l = link.getLink(m_cms); 329 if (TAG_PARAM.equals(tag.getTagName())) { 330 // HACK: to distinguish link parameters the link itself has to end with '&' or '?' 331 // another solution should be a kind of macro... 332 if (!l.endsWith(CmsRequestUtil.URL_DELIMITER) 333 && !l.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) { 334 if (l.indexOf(CmsRequestUtil.URL_DELIMITER) > 0) { 335 l += CmsRequestUtil.PARAMETER_DELIMITER; 336 } else { 337 l += CmsRequestUtil.URL_DELIMITER; 338 } 339 } 340 } 341 // set the real target 342 tag.setAttribute(attr, CmsEncoder.escapeXml(l)); 343 344 // In the Online project, remove href attributes with broken links from A tags. 345 if (tag.getTagName().equalsIgnoreCase("A") 346 && m_cms.getRequestContext().isOnlineOrEditDisabled() 347 && link.isInternal() 348 && (link.getResource() == null)) { 349 // getResource() == null could either mean checkConsistency has not been called, or that the link is broken. 350 // so we have to call checkConsistency to eliminate the first possibility. 351 link.checkConsistency(m_cms); 352 if (link.getResource() == null) { 353 tag.removeAttribute(ATTRIBUTE_HREF); 354 tag.setAttribute(CmsGwtConstants.ATTR_DEAD_LINK_MARKER, "true"); 355 } 356 } 357 } 358 break; 359 case REPLACE_LINKS: 360 // links are replaced with macros 361 String targetUri = tag.getAttribute(attr); 362 if (CmsStringUtil.isNotEmpty(targetUri)) { 363 String internalUri = null; 364 if (!CmsMacroResolver.isMacro(targetUri)) { 365 m_cms.getRequestContext().setAttribute( 366 CmsDefaultLinkSubstitutionHandler.DONT_USE_CURRENT_SITE_FOR_WORKPLACE_REQUESTS, 367 "true"); 368 internalUri = OpenCms.getLinkManager().getRootPath(m_cms, targetUri, m_relativePath); 369 } 370 // HACK: to distinguish link parameters the link itself has to end with '&' or '?' 371 // another solution should be a kind of macro... 372 if (!TAG_PARAM.equals(tag.getTagName()) 373 || targetUri.endsWith(CmsRequestUtil.URL_DELIMITER) 374 || targetUri.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) { 375 if (internalUri != null) { 376 internalUri = rewriteUri(internalUri); 377 // this is an internal link 378 link = m_linkTable.addLink(type, internalUri, true); 379 // link management check 380 link.checkConsistency(m_cms); 381 382 if ("IMG".equals(tag.getTagName()) || TAG_AREA.equals(tag.getTagName())) { 383 // now ensure the image has the "alt" attribute set 384 setAltAttributeFromTitle(tag, internalUri); 385 } 386 } else { 387 // this is an external link 388 link = m_linkTable.addLink(type, targetUri, false); 389 } 390 } 391 if (link != null) { 392 tag.setAttribute(attr, CmsMacroResolver.formatMacro(link.getName())); 393 } 394 } 395 break; 396 default: // empty 397 } 398 } 399 400 /** 401 * Process a link tag.<p> 402 * 403 * @param tag the tag to process 404 */ 405 protected void processLinkTag(LinkTag tag) { 406 407 processLink(tag, ATTRIBUTE_HREF, CmsRelationType.valueOf(tag.getTagName())); 408 } 409 410 /** 411 * Process an object tag.<p> 412 * 413 * @param tag the tag to process 414 */ 415 protected void processObjectTag(ObjectTag tag) { 416 417 CmsRelationType type = CmsRelationType.valueOf(tag.getTagName()); 418 for (int i = 0; i < OBJECT_TAG_LINKED_ATTRIBS.length; i++) { 419 String attr = OBJECT_TAG_LINKED_ATTRIBS[i]; 420 processLink(tag, attr, type); 421 if ((i == 0) && (tag.getAttribute(attr) != null)) { 422 // if code base is available, the other attributes are relative to it, so do not process them 423 break; 424 } 425 } 426 SimpleNodeIterator itChildren = tag.children(); 427 while (itChildren.hasMoreNodes()) { 428 Node node = itChildren.nextNode(); 429 if (node instanceof Tag) { 430 Tag childTag = (Tag)node; 431 if (TAG_PARAM.equals(childTag.getTagName())) { 432 processLink(childTag, ATTRIBUTE_VALUE, type); 433 } 434 } 435 } 436 } 437 438 /** 439 * Ensures that the given tag has the "alt" attribute set.<p> 440 * 441 * if not set, it will be set from the title of the given resource.<p> 442 * 443 * @param tag the tag to set the alt attribute for 444 * @param internalUri the internal URI to get the title from 445 */ 446 protected void setAltAttributeFromTitle(Tag tag, String internalUri) { 447 448 boolean hasAltAttrib = (tag.getAttribute("alt") != null); 449 if (!hasAltAttrib) { 450 String value = null; 451 if ((internalUri != null) && (m_rootCms != null)) { 452 // internal image: try to read the "alt" text from the "Title" property 453 try { 454 value = m_rootCms.readPropertyObject( 455 internalUri, 456 CmsPropertyDefinition.PROPERTY_TITLE, 457 false).getValue(); 458 } catch (CmsException e) { 459 // property can't be read, ignore 460 } 461 } 462 // some editors add a "/" at the end of the tag, we must make sure to insert before that 463 @SuppressWarnings("unchecked") 464 Vector<Attribute> attrs = tag.getAttributesEx(); 465 // first element is always the tag name 466 attrs.add(1, new Attribute(" ")); 467 attrs.add(2, new Attribute("alt", value == null ? "" : value, '"')); 468 } 469 } 470 471 /** 472 * Use the {@link org.opencms.file.wrapper.CmsObjectWrapper} to restore the link in the VFS.<p> 473 * 474 * @param internalUri the internal URI to restore 475 * 476 * @return the restored URI 477 */ 478 private String rewriteUri(String internalUri) { 479 480 // if an object wrapper is used, rewrite the uri 481 if (m_cms != null) { 482 Object obj = m_cms.getRequestContext().getAttribute(CmsObjectWrapper.ATTRIBUTE_NAME); 483 if (obj != null) { 484 CmsObjectWrapper wrapper = (CmsObjectWrapper)obj; 485 return wrapper.restoreLink(internalUri); 486 } 487 } 488 489 return internalUri; 490 } 491}