/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.staticexport;

import org.opencms.file.CmsObject;
import org.opencms.file.CmsPropertyDefinition;
import org.opencms.file.wrapper.CmsObjectWrapper;
import org.opencms.gwt.shared.CmsGwtConstants;
import org.opencms.i18n.CmsEncoder;
import org.opencms.main.CmsException;
import org.opencms.main.OpenCms;
import org.opencms.relations.CmsLink;
import org.opencms.relations.CmsRelationType;
import org.opencms.util.CmsHtmlParser;
import org.opencms.util.CmsMacroResolver;
import org.opencms.util.CmsRequestUtil;
import org.opencms.util.CmsStringUtil;
import org.opencms.util.CmsUUID;

import java.util.Vector;

import org.htmlparser.Attribute;
import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ObjectTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

/**
 * Implements the HTML parser node visitor pattern to
 * exchange all links on the page.<p>
 *
 * The processor works in one of two modes, selected by the entry point that is called:
 * {@link #processLinks(String)} replaces link macros with real links, and
 * {@link #replaceLinks(String)} replaces real links with link macros, registering
 * every link it finds in the link table.<p>
 *
 * The current mode is kept in an instance field that is set by these entry points.<p>
 *
 * @since 6.0.0
 */
public class CmsLinkProcessor extends CmsHtmlParser {

    /** Constant for the attribute name. */
    public static final String ATTRIBUTE_HREF = "href";

    /** Constant for the attribute name. */
    public static final String ATTRIBUTE_SRC = "src";

    /** Constant for the attribute name. */
    public static final String ATTRIBUTE_VALUE = "value";

    /** HTML end. */
    public static final String HTML_END = "</body></html>";

    /** HTML start. */
    public static final String HTML_START = "<html><body>";

    /** Constant for the tag name. */
    public static final String TAG_AREA = "AREA";

    /** Constant for the tag name. */
    public static final String TAG_EMBED = "EMBED";

    /** Constant for the tag name. */
    public static final String TAG_IFRAME = "IFRAME";

    /** Constant for the tag name. */
    public static final String TAG_PARAM = "PARAM";

    /** List of attributes that may contain links for the embed tag. */
    private static final String[] EMBED_TAG_LINKED_ATTRIBS = new String[] {ATTRIBUTE_SRC, "pluginurl", "pluginspage"};

    /** List of attributes that may contain links for the object tag ("codebase" has to be first). */
    private static final String[] OBJECT_TAG_LINKED_ATTRIBS = new String[] {"codebase", "data", "datasrc"};

    /** Processing mode "process links" (macros to links). */
    private static final int PROCESS_LINKS = 1;

    /** Processing mode "replace links" (links to macros). */
    private static final int REPLACE_LINKS = 0;

    /** The current users OpenCms context, containing the users permission and site root context. */
    private CmsObject m_cms;

    /** The selected encoding to use for parsing the HTML. */
    private String m_encoding;

    /** The link table used for link macro replacements. */
    private CmsLinkTable m_linkTable;

    /** Current processing mode. */
    private int m_mode;

    /** The relative path for relative links, if not set, relative links are treated as external links. */
    private String m_relativePath;

    /** Another OpenCms context based on the current users OpenCms context, but with the site root set to '/'. */
    private CmsObject m_rootCms;

    /**
     * Creates a new link processor.<p>
     *
     * If a context is given, a second context with the site root set to "/" is derived from it;
     * if that derivation fails, the root context is left <code>null</code>.<p>
     *
     * @param cms the current users OpenCms context
     * @param linkTable the link table to use
     * @param encoding the encoding to use for parsing the HTML content
     * @param relativePath additional path for links with relative path (only used in "replace" mode)
     */
    public CmsLinkProcessor(CmsObject cms, CmsLinkTable linkTable, String encoding, String relativePath) {

        // echo mode must be on for link processor
        super(true);

        m_cms = cms;
        if (m_cms != null) {
            try {
                m_rootCms = OpenCms.initCmsObject(cms);
                m_rootCms.getRequestContext().setSiteRoot("/");
            } catch (CmsException e) {
                // this should not happen
                m_rootCms = null;
            }
        }
        m_linkTable = linkTable;
        m_encoding = encoding;
        m_relativePath = relativePath;
    }

    /**
     * Escapes all <code>&amp;</code>, i.e. replaces them with a <code>&amp;amp;</code>.<p>
     *
     * An ampersand that already starts the sequence <code>&amp;amp;</code> is left untouched,
     * so applying this method to an already escaped link does not escape it twice.
     * Every other ampersand (including the start of other entities) is escaped.<p>
     *
     * @param source the String to escape
     * @return the escaped String, or <code>null</code> if the source was <code>null</code>
     */
    public static String escapeLink(String source) {

        if (source == null) {
            return null;
        }
        StringBuilder result = new StringBuilder(source.length() * 2);
        for (int i = 0; i < source.length(); i++) {
            char ch = source.charAt(i);
            // don't escape already escaped &s: only an '&' that is NOT directly
            // followed by "amp;" has to be replaced by the entity
            if ((ch == '&') && !source.startsWith("amp;", i + 1)) {
                result.append("&amp;");
            } else {
                result.append(ch);
            }
        }
        return result.toString();
    }

    /**
     * Unescapes all <code>&amp;amp;</code>, that is replaces them with a <code>&amp;</code>.<p>
     *
     * @param source the String to unescape
     * @return the unescaped String, or <code>null</code> if the source was <code>null</code>
     */
    public static String unescapeLink(String source) {

        if (source == null) {
            return null;
        }
        return CmsStringUtil.substitute(source, "&amp;", "&");
    }

    /**
     * Returns the link table this link processor was initialized with.<p>
     *
     * @return the link table this link processor was initialized with
     */
    public CmsLinkTable getLinkTable() {

        return m_linkTable;
    }

    /**
     * Starts link processing for the given content in processing mode.<p>
     *
     * Macros are replaced by links.<p>
     *
     * @param content the content to process
     * @return the processed content with replaced macros
     *
     * @throws ParserException if something goes wrong
     */
    public String processLinks(String content) throws ParserException {

        m_mode = PROCESS_LINKS;
        return process(content, m_encoding);
    }

    /**
     * Starts link processing for the given content in replacement mode.<p>
     *
     * Links are replaced by macros.<p>
     *
     * @param content the content to process
     * @return the processed content with replaced links
     *
     * @throws ParserException if something goes wrong
     */
    public String replaceLinks(String content) throws ParserException {

        m_mode = REPLACE_LINKS;
        return process(content, m_encoding);
    }

    /**
     * Visitor method to process a tag (start).<p>
     *
     * Dispatches to the matching tag handler and then hands the (possibly modified)
     * tag to the super class.<p>
     *
     * @param tag the tag to process
     */
    @Override
    public void visitTag(Tag tag) {

        if (tag instanceof LinkTag) {
            processLinkTag((LinkTag)tag);
        } else if (tag instanceof ImageTag) {
            processImageTag((ImageTag)tag);
        } else if (tag instanceof ObjectTag) {
            processObjectTag((ObjectTag)tag);
        } else {
            // there are no specialized tag classes for these tags :(
            if (TAG_EMBED.equals(tag.getTagName())) {
                processEmbedTag(tag);
            } else if (TAG_AREA.equals(tag.getTagName())) {
                processAreaTag(tag);
            } else if (TAG_IFRAME.equals(tag.getTagName())) {
                String src = tag.getAttribute(ATTRIBUTE_SRC);
                if ((src != null) && !src.startsWith("//")) {
                    // link processing does not work for protocol-relative URLs, which were once used in Youtube embed
                    // codes.
                    processLink(tag, ATTRIBUTE_SRC, CmsRelationType.HYPERLINK);
                }
            }
        }
        // append text content of the tag (may have been changed by above methods)
        super.visitTag(tag);
    }

    /**
     * Process an area tag.<p>
     *
     * @param tag the tag to process
     */
    protected void processAreaTag(Tag tag) {

        processLink(tag, ATTRIBUTE_HREF, CmsRelationType.HYPERLINK);
    }

    /**
     * Process an embed tag.<p>
     *
     * Every attribute of the embed tag that may contain a link is processed.<p>
     *
     * @param tag the tag to process
     */
    protected void processEmbedTag(Tag tag) {

        for (String attr : EMBED_TAG_LINKED_ATTRIBS) {
            processLink(tag, attr, CmsRelationType.EMBEDDED_OBJECT);
        }
    }

    /**
     * Process an image tag.<p>
     *
     * @param tag the tag to process
     */
    protected void processImageTag(ImageTag tag) {

        processLink(tag, ATTRIBUTE_SRC, CmsRelationType.valueOf(tag.getTagName()));
    }

    /**
     * Process a tag having a link in the given attribute, considering the link as the given type.<p>
     *
     * Nothing is done if the tag does not have the given attribute. Otherwise the attribute
     * value is rewritten according to the current processing mode.<p>
     *
     * @param tag the tag to process
     * @param attr the attribute
     * @param type the link type
     */
    protected void processLink(Tag tag, String attr, CmsRelationType type) {

        if (tag.getAttribute(attr) == null) {
            return;
        }
        CmsLink link = null;

        switch (m_mode) {
            case PROCESS_LINKS:
                // macros are replaced with links
                link = m_linkTable.getLink(CmsMacroResolver.stripMacro(tag.getAttribute(attr)));
                if (link != null) {
                    // link management check
                    String l = link.getLink(m_cms);
                    if (TAG_PARAM.equals(tag.getTagName())) {
                        // HACK: to distinguish link parameters the link itself has to end with '&' or '?'
                        // another solution should be a kind of macro...
                        if (!l.endsWith(CmsRequestUtil.URL_DELIMITER)
                            && !l.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) {
                            if (l.indexOf(CmsRequestUtil.URL_DELIMITER) > 0) {
                                l += CmsRequestUtil.PARAMETER_DELIMITER;
                            } else {
                                l += CmsRequestUtil.URL_DELIMITER;
                            }
                        }
                    }
                    // set the real target
                    tag.setAttribute(attr, CmsEncoder.escapeXml(l));

                    // In the Online project, remove href attributes with broken links from A tags.
                    // Exception: We don't do this if the target is empty, because fragment links ('#anchor')
                    // in the WYSIWYG editor are stored as internal links with empty targets
                    if (tag.getTagName().equalsIgnoreCase("A")
                        && m_cms.getRequestContext().isOnlineOrEditDisabled()
                        && link.isInternal()
                        && !CmsStringUtil.isEmpty(link.getTarget())
                        && (link.getResource() == null)) {

                        // getResource() == null could either mean checkConsistency has not been called, or that the link is broken.
                        // so we have to call checkConsistency to eliminate the first possibility.
                        link.checkConsistency(m_cms);
                        // The consistency check tries to read the resource by id first, and then by path if this fails. If at some point in this process
                        // we get a security exception, then there must be some resource there, either for the given id or for the path, although we don't
                        // know at this point in the code which one it is. But it doesn't matter; because a potential link target exists, we don't remove the link.
                        //
                        // The null UUID (00000000-0000-0000-0000-000000000000) corresponds to a static resource served from a Jar file.
                        // We probably don't need that in the Online project, but we don't need to actively remove that, either.
                        if ((link.getResource() == null)
                            && !CmsUUID.getNullUUID().equals(link.getStructureId())
                            && !link.hadSecurityErrorDuringLastConsistencyCheck()) {
                            tag.removeAttribute(ATTRIBUTE_HREF);
                            tag.setAttribute(CmsGwtConstants.ATTR_DEAD_LINK_MARKER, "true");
                        }
                    }
                }
                break;
            case REPLACE_LINKS:
                // links are replaced with macros
                String targetUri = tag.getAttribute(attr);
                if (CmsStringUtil.isNotEmpty(targetUri)) {
                    String internalUri = null;
                    if (!CmsMacroResolver.isMacro(targetUri)) {
                        m_cms.getRequestContext().setAttribute(
                            CmsDefaultLinkSubstitutionHandler.DONT_USE_CURRENT_SITE_FOR_WORKPLACE_REQUESTS,
                            "true");
                        internalUri = OpenCms.getLinkManager().getRootPath(m_cms, targetUri, m_relativePath);
                    }
                    // HACK: to distinguish link parameters the link itself has to end with '&' or '?'
                    // another solution should be a kind of macro...
                    if (!TAG_PARAM.equals(tag.getTagName())
                        || targetUri.endsWith(CmsRequestUtil.URL_DELIMITER)
                        || targetUri.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) {
                        if (internalUri != null) {
                            internalUri = rewriteUri(internalUri);
                            // this is an internal link
                            link = m_linkTable.addLink(type, internalUri, true);
                            // link management check
                            link.checkConsistency(m_cms);

                            if ("IMG".equals(tag.getTagName()) || TAG_AREA.equals(tag.getTagName())) {
                                // now ensure the image has the "alt" attribute set
                                setAltAttributeFromTitle(tag, internalUri);
                            }
                        } else {
                            // this is an external link
                            link = m_linkTable.addLink(type, targetUri, false);
                        }
                    }
                    if (link != null) {
                        tag.setAttribute(attr, CmsMacroResolver.formatMacro(link.getName()));
                    }
                }
                break;
            default: // empty
        }
    }

    /**
     * Process a link tag.<p>
     *
     * @param tag the tag to process
     */
    protected void processLinkTag(LinkTag tag) {

        processLink(tag, ATTRIBUTE_HREF, CmsRelationType.valueOf(tag.getTagName()));
    }

    /**
     * Process an object tag.<p>
     *
     * The link attributes of the object tag itself are processed first, then the
     * "value" attribute of all direct "param" child tags.<p>
     *
     * @param tag the tag to process
     */
    protected void processObjectTag(ObjectTag tag) {

        CmsRelationType type = CmsRelationType.valueOf(tag.getTagName());
        for (int i = 0; i < OBJECT_TAG_LINKED_ATTRIBS.length; i++) {
            String attr = OBJECT_TAG_LINKED_ATTRIBS[i];
            processLink(tag, attr, type);
            if ((i == 0) && (tag.getAttribute(attr) != null)) {
                // if code base is available, the other attributes are relative to it, so do not process them
                break;
            }
        }
        SimpleNodeIterator itChildren = tag.children();
        while (itChildren.hasMoreNodes()) {
            Node node = itChildren.nextNode();
            if (node instanceof Tag) {
                Tag childTag = (Tag)node;
                if (TAG_PARAM.equals(childTag.getTagName())) {
                    processLink(childTag, ATTRIBUTE_VALUE, type);
                }
            }
        }
    }

    /**
     * Ensures that the given tag has the "alt" attribute set.<p>
     *
     * If not set, it will be set from the title of the given resource; if the title
     * can not be read, an empty "alt" attribute is added.<p>
     *
     * @param tag the tag to set the alt attribute for
     * @param internalUri the internal URI to get the title from
     */
    protected void setAltAttributeFromTitle(Tag tag, String internalUri) {

        boolean hasAltAttrib = (tag.getAttribute("alt") != null);
        if (!hasAltAttrib) {
            String value = null;
            if ((internalUri != null) && (m_rootCms != null)) {
                // internal image: try to read the "alt" text from the "Title" property
                try {
                    value = m_rootCms.readPropertyObject(
                        internalUri,
                        CmsPropertyDefinition.PROPERTY_TITLE,
                        false).getValue();
                } catch (CmsException e) {
                    // property can't be read, ignore
                }
            }
            // some editors add a "/" at the end of the tag, we must make sure to insert before that
            @SuppressWarnings("unchecked")
            Vector<Attribute> attrs = tag.getAttributesEx();
            // first element is always the tag name
            attrs.add(1, new Attribute(" "));
            attrs.add(2, new Attribute("alt", value == null ? "" : value, '"'));
        }
    }

    /**
     * Use the {@link org.opencms.file.wrapper.CmsObjectWrapper} to restore the link in the VFS.<p>
     *
     * If no wrapper is stored in the request context, the URI is returned unchanged.<p>
     *
     * @param internalUri the internal URI to restore
     *
     * @return the restored URI
     */
    private String rewriteUri(String internalUri) {

        // if an object wrapper is used, rewrite the uri
        if (m_cms != null) {
            Object obj = m_cms.getRequestContext().getAttribute(CmsObjectWrapper.ATTRIBUTE_NAME);
            if (obj != null) {
                CmsObjectWrapper wrapper = (CmsObjectWrapper)obj;
                return wrapper.restoreLink(internalUri);
            }
        }

        return internalUri;
    }
}