001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.staticexport;
029
030import org.opencms.file.CmsObject;
031import org.opencms.file.CmsPropertyDefinition;
032import org.opencms.file.wrapper.CmsObjectWrapper;
033import org.opencms.gwt.shared.CmsGwtConstants;
034import org.opencms.i18n.CmsEncoder;
035import org.opencms.main.CmsException;
036import org.opencms.main.OpenCms;
037import org.opencms.relations.CmsLink;
038import org.opencms.relations.CmsRelationType;
039import org.opencms.util.CmsHtmlParser;
040import org.opencms.util.CmsMacroResolver;
041import org.opencms.util.CmsRequestUtil;
042import org.opencms.util.CmsStringUtil;
043import org.opencms.util.CmsUUID;
044
045import java.util.Vector;
046
047import org.htmlparser.Attribute;
048import org.htmlparser.Node;
049import org.htmlparser.Tag;
050import org.htmlparser.tags.ImageTag;
051import org.htmlparser.tags.LinkTag;
052import org.htmlparser.tags.ObjectTag;
053import org.htmlparser.util.ParserException;
054import org.htmlparser.util.SimpleNodeIterator;
055
056/**
057 * Implements the HTML parser node visitor pattern to
058 * exchange all links on the page.<p>
059 *
060 * @since 6.0.0
061 */
062public class CmsLinkProcessor extends CmsHtmlParser {
063
064    /** Constant for the attribute name. */
065    public static final String ATTRIBUTE_HREF = "href";
066
067    /** Constant for the attribute name. */
068    public static final String ATTRIBUTE_SRC = "src";
069
070    /** Constant for the attribute name. */
071    public static final String ATTRIBUTE_VALUE = "value";
072
073    /** HTML end. */
074    public static final String HTML_END = "</body></html>";
075
076    /** HTML start. */
077    public static final String HTML_START = "<html><body>";
078
079    /** Constant for the tag name. */
080    public static final String TAG_AREA = "AREA";
081
082    /** Constant for the tag name. */
083    public static final String TAG_EMBED = "EMBED";
084
085    /** Constant for the tag name. */
086    public static final String TAG_IFRAME = "IFRAME";
087
088    /** Constant for the tag name. */
089    public static final String TAG_PARAM = "PARAM";
090
091    /** List of attributes that may contain links for the embed tag. */
092    private static final String[] EMBED_TAG_LINKED_ATTRIBS = new String[] {ATTRIBUTE_SRC, "pluginurl", "pluginspage"};
093
094    /** List of attributes that may contain links for the object tag ("codebase" has to be first). */
095    private static final String[] OBJECT_TAG_LINKED_ATTRIBS = new String[] {"codebase", "data", "datasrc"};
096
097    /** Processing mode "process links" (macros to links). */
098    private static final int PROCESS_LINKS = 1;
099
100    /** Processing mode "replace links" (links to macros).  */
101    private static final int REPLACE_LINKS = 0;
102
103    /** The current users OpenCms context, containing the users permission and site root context. */
104    private CmsObject m_cms;
105
106    /** The selected encoding to use for parsing the HTML. */
107    private String m_encoding;
108
109    /** The link table used for link macro replacements. */
110    private CmsLinkTable m_linkTable;
111
112    /** Current processing mode. */
113    private int m_mode;
114
115    /** The relative path for relative links, if not set, relative links are treated as external links. */
116    private String m_relativePath;
117
118    /** Another OpenCms context based on the current users OpenCms context, but with the site root set to '/'. */
119    private CmsObject m_rootCms;
120
121    /**
122     * Creates a new link processor.<p>
123     *
124     * @param cms the current users OpenCms context
125     * @param linkTable the link table to use
126     * @param encoding the encoding to use for parsing the HTML content
127     * @param relativePath additional path for links with relative path (only used in "replace" mode)
128     */
129    public CmsLinkProcessor(CmsObject cms, CmsLinkTable linkTable, String encoding, String relativePath) {
130
131        // echo mode must be on for link processor
132        super(true);
133
134        m_cms = cms;
135        if (m_cms != null) {
136            try {
137                m_rootCms = OpenCms.initCmsObject(cms);
138                m_rootCms.getRequestContext().setSiteRoot("/");
139            } catch (CmsException e) {
140                // this should not happen
141                m_rootCms = null;
142            }
143        }
144        m_linkTable = linkTable;
145        m_encoding = encoding;
146        m_relativePath = relativePath;
147    }
148
149    /**
150     * Escapes all <code>&</code>, e.g. replaces them with a <code>&amp;</code>.<p>
151     *
152     * @param source the String to escape
153     * @return the escaped String
154     */
155    public static String escapeLink(String source) {
156
157        if (source == null) {
158            return null;
159        }
160        StringBuffer result = new StringBuffer(source.length() * 2);
161        int terminatorIndex;
162        for (int i = 0; i < source.length(); ++i) {
163            char ch = source.charAt(i);
164            switch (ch) {
165                case '&':
166                    // don't escape already escaped &amps;
167                    terminatorIndex = source.indexOf(';', i);
168                    if (terminatorIndex > 0) {
169                        String substr = source.substring(i + 1, terminatorIndex);
170                        if ("amp".equals(substr)) {
171                            result.append(ch);
172                        } else {
173                            result.append("&amp;");
174                        }
175                    } else {
176                        result.append("&amp;");
177                    }
178                    break;
179                default:
180                    result.append(ch);
181            }
182        }
183        return new String(result);
184    }
185
186    /**
187     * Unescapes all <code>&amp;amp;</code>, that is replaces them with a <code>&</code>.<p>
188     *
189     * @param source the String to unescape
190     * @return the unescaped String
191     */
192    public static String unescapeLink(String source) {
193
194        if (source == null) {
195            return null;
196        }
197        return CmsStringUtil.substitute(source, "&amp;", "&");
198
199    }
200
201    /**
202     * Returns the link table this link processor was initialized with.<p>
203     *
204     * @return the link table this link processor was initialized with
205     */
206    public CmsLinkTable getLinkTable() {
207
208        return m_linkTable;
209    }
210
211    /**
212     * Starts link processing for the given content in processing mode.<p>
213     *
214     * Macros are replaced by links.<p>
215     *
216     * @param content the content to process
217     * @return the processed content with replaced macros
218     *
219     * @throws ParserException if something goes wrong
220     */
221    public String processLinks(String content) throws ParserException {
222
223        m_mode = PROCESS_LINKS;
224        return process(content, m_encoding);
225    }
226
227    /**
228     * Starts link processing for the given content in replacement mode.<p>
229     *
230     * Links are replaced by macros.<p>
231     *
232     * @param content the content to process
233     * @return the processed content with replaced links
234     *
235     * @throws ParserException if something goes wrong
236     */
237    public String replaceLinks(String content) throws ParserException {
238
239        m_mode = REPLACE_LINKS;
240        return process(content, m_encoding);
241    }
242
243    /**
244     * Visitor method to process a tag (start).<p>
245     *
246     * @param tag the tag to process
247     */
248    @Override
249    public void visitTag(Tag tag) {
250
251        if (tag instanceof LinkTag) {
252            processLinkTag((LinkTag)tag);
253        } else if (tag instanceof ImageTag) {
254            processImageTag((ImageTag)tag);
255        } else if (tag instanceof ObjectTag) {
256            processObjectTag((ObjectTag)tag);
257        } else {
258            // there are no specialized tag classes for these tags :(
259            if (TAG_EMBED.equals(tag.getTagName())) {
260                processEmbedTag(tag);
261            } else if (TAG_AREA.equals(tag.getTagName())) {
262                processAreaTag(tag);
263            } else if (TAG_IFRAME.equals(tag.getTagName())) {
264                String src = tag.getAttribute(ATTRIBUTE_SRC);
265                if ((src != null) && !src.startsWith("//")) {
266                    // link processing does not work for protocol-relative URLs, which were once used in Youtube embed
267                    // codes.
268                    processLink(tag, ATTRIBUTE_SRC, CmsRelationType.HYPERLINK);
269                }
270            }
271        }
272        // append text content of the tag (may have been changed by above methods)
273        super.visitTag(tag);
274    }
275
276    /**
277     * Process an area tag.<p>
278     *
279     * @param tag the tag to process
280     */
281    protected void processAreaTag(Tag tag) {
282
283        processLink(tag, ATTRIBUTE_HREF, CmsRelationType.HYPERLINK);
284    }
285
286    /**
287     * Process an embed tag.<p>
288     *
289     * @param tag the tag to process
290     */
291    protected void processEmbedTag(Tag tag) {
292
293        for (int i = 0; i < EMBED_TAG_LINKED_ATTRIBS.length; i++) {
294            String attr = EMBED_TAG_LINKED_ATTRIBS[i];
295            processLink(tag, attr, CmsRelationType.EMBEDDED_OBJECT);
296        }
297    }
298
299    /**
300     * Process an image tag.<p>
301     *
302     * @param tag the tag to process
303     */
304    protected void processImageTag(ImageTag tag) {
305
306        processLink(tag, ATTRIBUTE_SRC, CmsRelationType.valueOf(tag.getTagName()));
307    }
308
309    /**
310     * Process a tag having a link in the given attribute, considering the link as the given type.<p>
311     *
312     * @param tag the tag to process
313     * @param attr the attribute
314     * @param type the link type
315     */
316    protected void processLink(Tag tag, String attr, CmsRelationType type) {
317
318        if (tag.getAttribute(attr) == null) {
319            return;
320        }
321        CmsLink link = null;
322
323        switch (m_mode) {
324            case PROCESS_LINKS:
325                // macros are replaced with links
326                link = m_linkTable.getLink(CmsMacroResolver.stripMacro(tag.getAttribute(attr)));
327                if (link != null) {
328                    // link management check
329                    String l = link.getLink(m_cms);
330                    if (TAG_PARAM.equals(tag.getTagName())) {
331                        // HACK: to distinguish link parameters the link itself has to end with '&' or '?'
332                        // another solution should be a kind of macro...
333                        if (!l.endsWith(CmsRequestUtil.URL_DELIMITER)
334                            && !l.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) {
335                            if (l.indexOf(CmsRequestUtil.URL_DELIMITER) > 0) {
336                                l += CmsRequestUtil.PARAMETER_DELIMITER;
337                            } else {
338                                l += CmsRequestUtil.URL_DELIMITER;
339                            }
340                        }
341                    }
342                    // set the real target
343                    tag.setAttribute(attr, CmsEncoder.escapeXml(l));
344
345                    // In the Online project, remove href attributes with broken links from A tags.
346                    // Exception: We don't do this if the target is empty, because fragment links ('#anchor')
347                    // in the WYSIWYG editor are stored as internal links with empty targets
348                    if (tag.getTagName().equalsIgnoreCase("A")
349                        && m_cms.getRequestContext().isOnlineOrEditDisabled()
350                        && link.isInternal()
351                        && !CmsStringUtil.isEmpty(link.getTarget())
352                        && (link.getResource() == null)) {
353
354                        // getResource() == null could either mean checkConsistency has not been called, or that the link is broken.
355                        // so we have to call checkConsistency to eliminate the first possibility.
356                        link.checkConsistency(m_cms);
357                        // The consistency check tries to read the resource by id first, and then by path if this fails. If at some point in this process
358                        // we get a security exception, then there must be some resource there, either for the given id or for the path, although we don't
359                        // know at this point in the code which one it is. But it doesn't matter; because a potential link target exists, we don't remove the link.
360                        if ((link.getResource() == null)
361                            && !CmsUUID.getNullUUID().equals(
362                                link.getStructureId()) /* 00000000-0000-0000-0000-000000000000 corresponds to static resource served from Jar file. We probably don't need that in the Online project, but we don't need to actively remove that, either. */
363                            && !link.hadSecurityErrorDuringLastConsistencyCheck()) {
364                            tag.removeAttribute(ATTRIBUTE_HREF);
365                            tag.setAttribute(CmsGwtConstants.ATTR_DEAD_LINK_MARKER, "true");
366                        }
367                    }
368                }
369                break;
370            case REPLACE_LINKS:
371                // links are replaced with macros
372                String targetUri = tag.getAttribute(attr);
373                if (CmsStringUtil.isNotEmpty(targetUri)) {
374                    String internalUri = null;
375                    if (!CmsMacroResolver.isMacro(targetUri)) {
376                        m_cms.getRequestContext().setAttribute(
377                            CmsDefaultLinkSubstitutionHandler.DONT_USE_CURRENT_SITE_FOR_WORKPLACE_REQUESTS,
378                            "true");
379                        internalUri = OpenCms.getLinkManager().getRootPath(m_cms, targetUri, m_relativePath);
380                    }
381                    // HACK: to distinguish link parameters the link itself has to end with '&' or '?'
382                    // another solution should be a kind of macro...
383                    if (!TAG_PARAM.equals(tag.getTagName())
384                        || targetUri.endsWith(CmsRequestUtil.URL_DELIMITER)
385                        || targetUri.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) {
386                        if (internalUri != null) {
387                            internalUri = rewriteUri(internalUri);
388                            // this is an internal link
389                            link = m_linkTable.addLink(type, internalUri, true);
390                            // link management check
391                            link.checkConsistency(m_cms);
392
393                            if ("IMG".equals(tag.getTagName()) || TAG_AREA.equals(tag.getTagName())) {
394                                // now ensure the image has the "alt" attribute set
395                                setAltAttributeFromTitle(tag, internalUri);
396                            }
397                        } else {
398                            // this is an external link
399                            link = m_linkTable.addLink(type, targetUri, false);
400                        }
401                    }
402                    if (link != null) {
403                        tag.setAttribute(attr, CmsMacroResolver.formatMacro(link.getName()));
404                    }
405                }
406                break;
407            default: // empty
408        }
409    }
410
411    /**
412     * Process a link tag.<p>
413     *
414     * @param tag the tag to process
415     */
416    protected void processLinkTag(LinkTag tag) {
417
418        processLink(tag, ATTRIBUTE_HREF, CmsRelationType.valueOf(tag.getTagName()));
419    }
420
421    /**
422     * Process an object tag.<p>
423     *
424     * @param tag the tag to process
425     */
426    protected void processObjectTag(ObjectTag tag) {
427
428        CmsRelationType type = CmsRelationType.valueOf(tag.getTagName());
429        for (int i = 0; i < OBJECT_TAG_LINKED_ATTRIBS.length; i++) {
430            String attr = OBJECT_TAG_LINKED_ATTRIBS[i];
431            processLink(tag, attr, type);
432            if ((i == 0) && (tag.getAttribute(attr) != null)) {
433                // if code base is available, the other attributes are relative to it, so do not process them
434                break;
435            }
436        }
437        SimpleNodeIterator itChildren = tag.children();
438        while (itChildren.hasMoreNodes()) {
439            Node node = itChildren.nextNode();
440            if (node instanceof Tag) {
441                Tag childTag = (Tag)node;
442                if (TAG_PARAM.equals(childTag.getTagName())) {
443                    processLink(childTag, ATTRIBUTE_VALUE, type);
444                }
445            }
446        }
447    }
448
449    /**
450     * Ensures that the given tag has the "alt" attribute set.<p>
451     *
452     * if not set, it will be set from the title of the given resource.<p>
453     *
454     * @param tag the tag to set the alt attribute for
455     * @param internalUri the internal URI to get the title from
456     */
457    protected void setAltAttributeFromTitle(Tag tag, String internalUri) {
458
459        boolean hasAltAttrib = (tag.getAttribute("alt") != null);
460        if (!hasAltAttrib) {
461            String value = null;
462            if ((internalUri != null) && (m_rootCms != null)) {
463                // internal image: try to read the "alt" text from the "Title" property
464                try {
465                    value = m_rootCms.readPropertyObject(
466                        internalUri,
467                        CmsPropertyDefinition.PROPERTY_TITLE,
468                        false).getValue();
469                } catch (CmsException e) {
470                    // property can't be read, ignore
471                }
472            }
473            // some editors add a "/" at the end of the tag, we must make sure to insert before that
474            @SuppressWarnings("unchecked")
475            Vector<Attribute> attrs = tag.getAttributesEx();
476            // first element is always the tag name
477            attrs.add(1, new Attribute(" "));
478            attrs.add(2, new Attribute("alt", value == null ? "" : value, '"'));
479        }
480    }
481
482    /**
483     * Use the {@link org.opencms.file.wrapper.CmsObjectWrapper} to restore the link in the VFS.<p>
484     *
485     * @param internalUri the internal URI to restore
486     *
487     * @return the restored URI
488     */
489    private String rewriteUri(String internalUri) {
490
491        // if an object wrapper is used, rewrite the uri
492        if (m_cms != null) {
493            Object obj = m_cms.getRequestContext().getAttribute(CmsObjectWrapper.ATTRIBUTE_NAME);
494            if (obj != null) {
495                CmsObjectWrapper wrapper = (CmsObjectWrapper)obj;
496                return wrapper.restoreLink(internalUri);
497            }
498        }
499
500        return internalUri;
501    }
502}