001/* 002 * This library is part of OpenCms - 003 * the Open Source Content Management System 004 * 005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com) 006 * 007 * This library is free software; you can redistribute it and/or 008 * modify it under the terms of the GNU Lesser General Public 009 * License as published by the Free Software Foundation; either 010 * version 2.1 of the License, or (at your option) any later version. 011 * 012 * This library is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * Lesser General Public License for more details. 016 * 017 * For further information about Alkacon Software GmbH & Co. KG, please see the 018 * company website: http://www.alkacon.com 019 * 020 * For further information about OpenCms, please see the 021 * project website: http://www.opencms.org 022 * 023 * You should have received a copy of the GNU Lesser General Public 024 * License along with this library; if not, write to the Free Software 025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 026 */ 027 028package org.opencms.staticexport; 029 030import org.opencms.file.CmsObject; 031import org.opencms.file.CmsPropertyDefinition; 032import org.opencms.file.wrapper.CmsObjectWrapper; 033import org.opencms.i18n.CmsEncoder; 034import org.opencms.main.CmsException; 035import org.opencms.main.OpenCms; 036import org.opencms.relations.CmsLink; 037import org.opencms.relations.CmsRelationType; 038import org.opencms.util.CmsHtmlParser; 039import org.opencms.util.CmsMacroResolver; 040import org.opencms.util.CmsRequestUtil; 041import org.opencms.util.CmsStringUtil; 042 043import java.util.Vector; 044 045import org.htmlparser.Attribute; 046import org.htmlparser.Node; 047import org.htmlparser.Tag; 048import org.htmlparser.tags.ImageTag; 049import org.htmlparser.tags.LinkTag; 050import org.htmlparser.tags.ObjectTag; 051import org.htmlparser.util.ParserException; 052import org.htmlparser.util.SimpleNodeIterator; 053 054/** 055 * Implements the HTML parser node visitor pattern to 056 * exchange all links on the page.<p> 057 * 058 * @since 6.0.0 059 */ 060public class CmsLinkProcessor extends CmsHtmlParser { 061 062 /** Constant for the attribute name. */ 063 public static final String ATTRIBUTE_HREF = "href"; 064 065 /** Constant for the attribute name. */ 066 public static final String ATTRIBUTE_SRC = "src"; 067 068 /** Constant for the attribute name. */ 069 public static final String ATTRIBUTE_VALUE = "value"; 070 071 /** HTML end. */ 072 public static final String HTML_END = "</body></html>"; 073 074 /** HTML start. */ 075 public static final String HTML_START = "<html><body>"; 076 077 /** Constant for the tag name. */ 078 public static final String TAG_AREA = "AREA"; 079 080 /** Constant for the tag name. */ 081 public static final String TAG_EMBED = "EMBED"; 082 083 /** Constant for the tag name. */ 084 public static final String TAG_IFRAME = "IFRAME"; 085 086 /** Constant for the tag name. */ 087 public static final String TAG_PARAM = "PARAM"; 088 089 /** List of attributes that may contain links for the embed tag. */ 090 private static final String[] EMBED_TAG_LINKED_ATTRIBS = new String[] {ATTRIBUTE_SRC, "pluginurl", "pluginspage"}; 091 092 /** List of attributes that may contain links for the object tag ("codebase" has to be first). */ 093 private static final String[] OBJECT_TAG_LINKED_ATTRIBS = new String[] {"codebase", "data", "datasrc"}; 094 095 /** Processing mode "process links". */ 096 private static final int PROCESS_LINKS = 1; 097 098 /** Processing mode "replace links". */ 099 private static final int REPLACE_LINKS = 0; 100 101 /** The current users OpenCms context, containing the users permission and site root context. */ 102 private CmsObject m_cms; 103 104 /** The selected encoding to use for parsing the HTML. */ 105 private String m_encoding; 106 107 /** The link table used for link macro replacements. */ 108 private CmsLinkTable m_linkTable; 109 110 /** Current processing mode. */ 111 private int m_mode; 112 113 /** The relative path for relative links, if not set, relative links are treated as external links. */ 114 private String m_relativePath; 115 116 /** Another OpenCms context based on the current users OpenCms context, but with the site root set to '/'. */ 117 private CmsObject m_rootCms; 118 119 /** 120 * Creates a new link processor.<p> 121 * 122 * @param cms the current users OpenCms context 123 * @param linkTable the link table to use 124 * @param encoding the encoding to use for parsing the HTML content 125 * @param relativePath additional path for links with relative path (only used in "replace" mode) 126 */ 127 public CmsLinkProcessor(CmsObject cms, CmsLinkTable linkTable, String encoding, String relativePath) { 128 129 // echo mode must be on for link processor 130 super(true); 131 132 m_cms = cms; 133 if (m_cms != null) { 134 try { 135 m_rootCms = OpenCms.initCmsObject(cms); 136 m_rootCms.getRequestContext().setSiteRoot("/"); 137 } catch (CmsException e) { 138 // this should not happen 139 m_rootCms = null; 140 } 141 } 142 m_linkTable = linkTable; 143 m_encoding = encoding; 144 m_relativePath = relativePath; 145 } 146 147 /** 148 * Escapes all <code>&</code>, e.g. replaces them with a <code>&</code>.<p> 149 * 150 * @param source the String to escape 151 * @return the escaped String 152 */ 153 public static String escapeLink(String source) { 154 155 if (source == null) { 156 return null; 157 } 158 StringBuffer result = new StringBuffer(source.length() * 2); 159 int terminatorIndex; 160 for (int i = 0; i < source.length(); ++i) { 161 char ch = source.charAt(i); 162 switch (ch) { 163 case '&': 164 // don't escape already escaped &s; 165 terminatorIndex = source.indexOf(';', i); 166 if (terminatorIndex > 0) { 167 String substr = source.substring(i + 1, terminatorIndex); 168 if ("amp".equals(substr)) { 169 result.append(ch); 170 } else { 171 result.append("&"); 172 } 173 } else { 174 result.append("&"); 175 } 176 break; 177 default: 178 result.append(ch); 179 } 180 } 181 return new String(result); 182 } 183 184 /** 185 * Unescapes all <code>&amp;</code>, that is replaces them with a <code>&</code>.<p> 186 * 187 * @param source the String to unescape 188 * @return the unescaped String 189 */ 190 public static String unescapeLink(String source) { 191 192 if (source == null) { 193 return null; 194 } 195 return CmsStringUtil.substitute(source, "&", "&"); 196 197 } 198 199 /** 200 * Returns the link table this link processor was initialized with.<p> 201 * 202 * @return the link table this link processor was initialized with 203 */ 204 public CmsLinkTable getLinkTable() { 205 206 return m_linkTable; 207 } 208 209 /** 210 * Starts link processing for the given content in processing mode.<p> 211 * 212 * Macros are replaced by links.<p> 213 * 214 * @param content the content to process 215 * @return the processed content with replaced macros 216 * 217 * @throws ParserException if something goes wrong 218 */ 219 public String processLinks(String content) throws ParserException { 220 221 m_mode = PROCESS_LINKS; 222 return process(content, m_encoding); 223 } 224 225 /** 226 * Starts link processing for the given content in replacement mode.<p> 227 * 228 * Links are replaced by macros.<p> 229 * 230 * @param content the content to process 231 * @return the processed content with replaced links 232 * 233 * @throws ParserException if something goes wrong 234 */ 235 public String replaceLinks(String content) throws ParserException { 236 237 m_mode = REPLACE_LINKS; 238 return process(content, m_encoding); 239 } 240 241 /** 242 * Visitor method to process a tag (start).<p> 243 * 244 * @param tag the tag to process 245 */ 246 @Override 247 public void visitTag(Tag tag) { 248 249 if (tag instanceof LinkTag) { 250 processLinkTag((LinkTag)tag); 251 } else if (tag instanceof ImageTag) { 252 processImageTag((ImageTag)tag); 253 } else if (tag instanceof ObjectTag) { 254 processObjectTag((ObjectTag)tag); 255 } else { 256 // there are no specialized tag classes for these tags :( 257 if (TAG_EMBED.equals(tag.getTagName())) { 258 processEmbedTag(tag); 259 } else if (TAG_AREA.equals(tag.getTagName())) { 260 processAreaTag(tag); 261 } else if (TAG_IFRAME.equals(tag.getTagName())) { 262 String src = tag.getAttribute(ATTRIBUTE_SRC); 263 if ((src != null) && !src.startsWith("//")) { 264 // link processing does not work for protocol-relative URLs, which were once used in Youtube embed 265 // codes. 266 processLink(tag, ATTRIBUTE_SRC, CmsRelationType.HYPERLINK); 267 } 268 } 269 } 270 // append text content of the tag (may have been changed by above methods) 271 super.visitTag(tag); 272 } 273 274 /** 275 * Process an area tag.<p> 276 * 277 * @param tag the tag to process 278 */ 279 protected void processAreaTag(Tag tag) { 280 281 processLink(tag, ATTRIBUTE_HREF, CmsRelationType.HYPERLINK); 282 } 283 284 /** 285 * Process an embed tag.<p> 286 * 287 * @param tag the tag to process 288 */ 289 protected void processEmbedTag(Tag tag) { 290 291 for (int i = 0; i < EMBED_TAG_LINKED_ATTRIBS.length; i++) { 292 String attr = EMBED_TAG_LINKED_ATTRIBS[i]; 293 processLink(tag, attr, CmsRelationType.EMBEDDED_OBJECT); 294 } 295 } 296 297 /** 298 * Process an image tag.<p> 299 * 300 * @param tag the tag to process 301 */ 302 protected void processImageTag(ImageTag tag) { 303 304 processLink(tag, ATTRIBUTE_SRC, CmsRelationType.valueOf(tag.getTagName())); 305 } 306 307 /** 308 * Process a tag having a link in the given attribute, considering the link as the given type.<p> 309 * 310 * @param tag the tag to process 311 * @param attr the attribute 312 * @param type the link type 313 */ 314 protected void processLink(Tag tag, String attr, CmsRelationType type) { 315 316 if (tag.getAttribute(attr) == null) { 317 return; 318 } 319 CmsLink link = null; 320 switch (m_mode) { 321 case PROCESS_LINKS: 322 // macros are replaced with links 323 link = m_linkTable.getLink(CmsMacroResolver.stripMacro(tag.getAttribute(attr))); 324 if (link != null) { 325 // link management check 326 String l = link.getLink(m_cms); 327 if (TAG_PARAM.equals(tag.getTagName())) { 328 // HACK: to distinguish link parameters the link itself has to end with '&' or '?' 329 // another solution should be a kind of macro... 330 if (!l.endsWith(CmsRequestUtil.URL_DELIMITER) 331 && !l.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) { 332 if (l.indexOf(CmsRequestUtil.URL_DELIMITER) > 0) { 333 l += CmsRequestUtil.PARAMETER_DELIMITER; 334 } else { 335 l += CmsRequestUtil.URL_DELIMITER; 336 } 337 } 338 } 339 // set the real target 340 tag.setAttribute(attr, CmsEncoder.escapeXml(l)); 341 } 342 break; 343 case REPLACE_LINKS: 344 // links are replaced with macros 345 String targetUri = tag.getAttribute(attr); 346 if (CmsStringUtil.isNotEmpty(targetUri)) { 347 String internalUri = null; 348 if (!CmsMacroResolver.isMacro(targetUri)) { 349 m_cms.getRequestContext().setAttribute( 350 CmsDefaultLinkSubstitutionHandler.DONT_USE_CURRENT_SITE_FOR_WORKPLACE_REQUESTS, 351 "true"); 352 internalUri = OpenCms.getLinkManager().getRootPath(m_cms, targetUri, m_relativePath); 353 } 354 // HACK: to distinguish link parameters the link itself has to end with '&' or '?' 355 // another solution should be a kind of macro... 356 if (!TAG_PARAM.equals(tag.getTagName()) 357 || targetUri.endsWith(CmsRequestUtil.URL_DELIMITER) 358 || targetUri.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) { 359 if (internalUri != null) { 360 internalUri = rewriteUri(internalUri); 361 // this is an internal link 362 link = m_linkTable.addLink(type, internalUri, true); 363 // link management check 364 link.checkConsistency(m_cms); 365 366 if ("IMG".equals(tag.getTagName()) || TAG_AREA.equals(tag.getTagName())) { 367 // now ensure the image has the "alt" attribute set 368 setAltAttributeFromTitle(tag, internalUri); 369 } 370 } else { 371 // this is an external link 372 link = m_linkTable.addLink(type, targetUri, false); 373 } 374 } 375 if (link != null) { 376 tag.setAttribute(attr, CmsMacroResolver.formatMacro(link.getName())); 377 } 378 } 379 break; 380 default: // empty 381 } 382 } 383 384 /** 385 * Process a link tag.<p> 386 * 387 * @param tag the tag to process 388 */ 389 protected void processLinkTag(LinkTag tag) { 390 391 processLink(tag, ATTRIBUTE_HREF, CmsRelationType.valueOf(tag.getTagName())); 392 } 393 394 /** 395 * Process an object tag.<p> 396 * 397 * @param tag the tag to process 398 */ 399 protected void processObjectTag(ObjectTag tag) { 400 401 CmsRelationType type = CmsRelationType.valueOf(tag.getTagName()); 402 for (int i = 0; i < OBJECT_TAG_LINKED_ATTRIBS.length; i++) { 403 String attr = OBJECT_TAG_LINKED_ATTRIBS[i]; 404 processLink(tag, attr, type); 405 if ((i == 0) && (tag.getAttribute(attr) != null)) { 406 // if code base is available, the other attributes are relative to it, so do not process them 407 break; 408 } 409 } 410 SimpleNodeIterator itChildren = tag.children(); 411 while (itChildren.hasMoreNodes()) { 412 Node node = itChildren.nextNode(); 413 if (node instanceof Tag) { 414 Tag childTag = (Tag)node; 415 if (TAG_PARAM.equals(childTag.getTagName())) { 416 processLink(childTag, ATTRIBUTE_VALUE, type); 417 } 418 } 419 } 420 } 421 422 /** 423 * Ensures that the given tag has the "alt" attribute set.<p> 424 * 425 * if not set, it will be set from the title of the given resource.<p> 426 * 427 * @param tag the tag to set the alt attribute for 428 * @param internalUri the internal URI to get the title from 429 */ 430 protected void setAltAttributeFromTitle(Tag tag, String internalUri) { 431 432 boolean hasAltAttrib = (tag.getAttribute("alt") != null); 433 if (!hasAltAttrib) { 434 String value = null; 435 if ((internalUri != null) && (m_rootCms != null)) { 436 // internal image: try to read the "alt" text from the "Title" property 437 try { 438 value = m_rootCms.readPropertyObject( 439 internalUri, 440 CmsPropertyDefinition.PROPERTY_TITLE, 441 false).getValue(); 442 } catch (CmsException e) { 443 // property can't be read, ignore 444 } 445 } 446 // some editors add a "/" at the end of the tag, we must make sure to insert before that 447 @SuppressWarnings("unchecked") 448 Vector<Attribute> attrs = tag.getAttributesEx(); 449 // first element is always the tag name 450 attrs.add(1, new Attribute(" ")); 451 attrs.add(2, new Attribute("alt", value == null ? "" : value, '"')); 452 } 453 } 454 455 /** 456 * Use the {@link org.opencms.file.wrapper.CmsObjectWrapper} to restore the link in the VFS.<p> 457 * 458 * @param internalUri the internal URI to restore 459 * 460 * @return the restored URI 461 */ 462 private String rewriteUri(String internalUri) { 463 464 // if an object wrapper is used, rewrite the uri 465 if (m_cms != null) { 466 Object obj = m_cms.getRequestContext().getAttribute(CmsObjectWrapper.ATTRIBUTE_NAME); 467 if (obj != null) { 468 CmsObjectWrapper wrapper = (CmsObjectWrapper)obj; 469 return wrapper.restoreLink(internalUri); 470 } 471 } 472 473 return internalUri; 474 } 475}