/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
027
028package org.opencms.staticexport;
029
030import org.opencms.file.CmsObject;
031import org.opencms.file.CmsPropertyDefinition;
032import org.opencms.file.wrapper.CmsObjectWrapper;
033import org.opencms.gwt.shared.CmsGwtConstants;
034import org.opencms.i18n.CmsEncoder;
035import org.opencms.main.CmsException;
036import org.opencms.main.OpenCms;
037import org.opencms.relations.CmsLink;
038import org.opencms.relations.CmsRelationType;
039import org.opencms.util.CmsHtmlParser;
040import org.opencms.util.CmsMacroResolver;
041import org.opencms.util.CmsRequestUtil;
042import org.opencms.util.CmsStringUtil;
043
044import java.util.Vector;
045
046import org.htmlparser.Attribute;
047import org.htmlparser.Node;
048import org.htmlparser.Tag;
049import org.htmlparser.tags.ImageTag;
050import org.htmlparser.tags.LinkTag;
051import org.htmlparser.tags.ObjectTag;
052import org.htmlparser.util.ParserException;
053import org.htmlparser.util.SimpleNodeIterator;
054
055/**
056 * Implements the HTML parser node visitor pattern to
057 * exchange all links on the page.<p>
058 *
059 * @since 6.0.0
060 */
061public class CmsLinkProcessor extends CmsHtmlParser {
062
063    /** Constant for the attribute name. */
064    public static final String ATTRIBUTE_HREF = "href";
065
066    /** Constant for the attribute name. */
067    public static final String ATTRIBUTE_SRC = "src";
068
069    /** Constant for the attribute name. */
070    public static final String ATTRIBUTE_VALUE = "value";
071
072    /** HTML end. */
073    public static final String HTML_END = "</body></html>";
074
075    /** HTML start. */
076    public static final String HTML_START = "<html><body>";
077
078    /** Constant for the tag name. */
079    public static final String TAG_AREA = "AREA";
080
081    /** Constant for the tag name. */
082    public static final String TAG_EMBED = "EMBED";
083
084    /** Constant for the tag name. */
085    public static final String TAG_IFRAME = "IFRAME";
086
087    /** Constant for the tag name. */
088    public static final String TAG_PARAM = "PARAM";
089
090    /** List of attributes that may contain links for the embed tag. */
091    private static final String[] EMBED_TAG_LINKED_ATTRIBS = new String[] {ATTRIBUTE_SRC, "pluginurl", "pluginspage"};
092
093    /** List of attributes that may contain links for the object tag ("codebase" has to be first). */
094    private static final String[] OBJECT_TAG_LINKED_ATTRIBS = new String[] {"codebase", "data", "datasrc"};
095
096    /** Processing mode "process links" (macros to links). */
097    private static final int PROCESS_LINKS = 1;
098
099    /** Processing mode "replace links" (links to macros).  */
100    private static final int REPLACE_LINKS = 0;
101
102    /** The current users OpenCms context, containing the users permission and site root context. */
103    private CmsObject m_cms;
104
105    /** The selected encoding to use for parsing the HTML. */
106    private String m_encoding;
107
108    /** The link table used for link macro replacements. */
109    private CmsLinkTable m_linkTable;
110
111    /** Current processing mode. */
112    private int m_mode;
113
114    /** The relative path for relative links, if not set, relative links are treated as external links. */
115    private String m_relativePath;
116
117    /** Another OpenCms context based on the current users OpenCms context, but with the site root set to '/'. */
118    private CmsObject m_rootCms;
119
120    /**
121     * Creates a new link processor.<p>
122     *
123     * @param cms the current users OpenCms context
124     * @param linkTable the link table to use
125     * @param encoding the encoding to use for parsing the HTML content
126     * @param relativePath additional path for links with relative path (only used in "replace" mode)
127     */
128    public CmsLinkProcessor(CmsObject cms, CmsLinkTable linkTable, String encoding, String relativePath) {
129
130        // echo mode must be on for link processor
131        super(true);
132
133        m_cms = cms;
134        if (m_cms != null) {
135            try {
136                m_rootCms = OpenCms.initCmsObject(cms);
137                m_rootCms.getRequestContext().setSiteRoot("/");
138            } catch (CmsException e) {
139                // this should not happen
140                m_rootCms = null;
141            }
142        }
143        m_linkTable = linkTable;
144        m_encoding = encoding;
145        m_relativePath = relativePath;
146    }
147
148    /**
149     * Escapes all <code>&</code>, e.g. replaces them with a <code>&amp;</code>.<p>
150     *
151     * @param source the String to escape
152     * @return the escaped String
153     */
154    public static String escapeLink(String source) {
155
156        if (source == null) {
157            return null;
158        }
159        StringBuffer result = new StringBuffer(source.length() * 2);
160        int terminatorIndex;
161        for (int i = 0; i < source.length(); ++i) {
162            char ch = source.charAt(i);
163            switch (ch) {
164                case '&':
165                    // don't escape already escaped &amps;
166                    terminatorIndex = source.indexOf(';', i);
167                    if (terminatorIndex > 0) {
168                        String substr = source.substring(i + 1, terminatorIndex);
169                        if ("amp".equals(substr)) {
170                            result.append(ch);
171                        } else {
172                            result.append("&amp;");
173                        }
174                    } else {
175                        result.append("&amp;");
176                    }
177                    break;
178                default:
179                    result.append(ch);
180            }
181        }
182        return new String(result);
183    }
184
185    /**
186     * Unescapes all <code>&amp;amp;</code>, that is replaces them with a <code>&</code>.<p>
187     *
188     * @param source the String to unescape
189     * @return the unescaped String
190     */
191    public static String unescapeLink(String source) {
192
193        if (source == null) {
194            return null;
195        }
196        return CmsStringUtil.substitute(source, "&amp;", "&");
197
198    }
199
200    /**
201     * Returns the link table this link processor was initialized with.<p>
202     *
203     * @return the link table this link processor was initialized with
204     */
205    public CmsLinkTable getLinkTable() {
206
207        return m_linkTable;
208    }
209
210    /**
211     * Starts link processing for the given content in processing mode.<p>
212     *
213     * Macros are replaced by links.<p>
214     *
215     * @param content the content to process
216     * @return the processed content with replaced macros
217     *
218     * @throws ParserException if something goes wrong
219     */
220    public String processLinks(String content) throws ParserException {
221
222        m_mode = PROCESS_LINKS;
223        return process(content, m_encoding);
224    }
225
226    /**
227     * Starts link processing for the given content in replacement mode.<p>
228     *
229     * Links are replaced by macros.<p>
230     *
231     * @param content the content to process
232     * @return the processed content with replaced links
233     *
234     * @throws ParserException if something goes wrong
235     */
236    public String replaceLinks(String content) throws ParserException {
237
238        m_mode = REPLACE_LINKS;
239        return process(content, m_encoding);
240    }
241
242    /**
243     * Visitor method to process a tag (start).<p>
244     *
245     * @param tag the tag to process
246     */
247    @Override
248    public void visitTag(Tag tag) {
249
250        if (tag instanceof LinkTag) {
251            processLinkTag((LinkTag)tag);
252        } else if (tag instanceof ImageTag) {
253            processImageTag((ImageTag)tag);
254        } else if (tag instanceof ObjectTag) {
255            processObjectTag((ObjectTag)tag);
256        } else {
257            // there are no specialized tag classes for these tags :(
258            if (TAG_EMBED.equals(tag.getTagName())) {
259                processEmbedTag(tag);
260            } else if (TAG_AREA.equals(tag.getTagName())) {
261                processAreaTag(tag);
262            } else if (TAG_IFRAME.equals(tag.getTagName())) {
263                String src = tag.getAttribute(ATTRIBUTE_SRC);
264                if ((src != null) && !src.startsWith("//")) {
265                    // link processing does not work for protocol-relative URLs, which were once used in Youtube embed
266                    // codes.
267                    processLink(tag, ATTRIBUTE_SRC, CmsRelationType.HYPERLINK);
268                }
269            }
270        }
271        // append text content of the tag (may have been changed by above methods)
272        super.visitTag(tag);
273    }
274
275    /**
276     * Process an area tag.<p>
277     *
278     * @param tag the tag to process
279     */
280    protected void processAreaTag(Tag tag) {
281
282        processLink(tag, ATTRIBUTE_HREF, CmsRelationType.HYPERLINK);
283    }
284
285    /**
286     * Process an embed tag.<p>
287     *
288     * @param tag the tag to process
289     */
290    protected void processEmbedTag(Tag tag) {
291
292        for (int i = 0; i < EMBED_TAG_LINKED_ATTRIBS.length; i++) {
293            String attr = EMBED_TAG_LINKED_ATTRIBS[i];
294            processLink(tag, attr, CmsRelationType.EMBEDDED_OBJECT);
295        }
296    }
297
298    /**
299     * Process an image tag.<p>
300     *
301     * @param tag the tag to process
302     */
303    protected void processImageTag(ImageTag tag) {
304
305        processLink(tag, ATTRIBUTE_SRC, CmsRelationType.valueOf(tag.getTagName()));
306    }
307
308    /**
309     * Process a tag having a link in the given attribute, considering the link as the given type.<p>
310     *
311     * @param tag the tag to process
312     * @param attr the attribute
313     * @param type the link type
314     */
315    protected void processLink(Tag tag, String attr, CmsRelationType type) {
316
317        if (tag.getAttribute(attr) == null) {
318            return;
319        }
320        CmsLink link = null;
321
322        switch (m_mode) {
323            case PROCESS_LINKS:
324                // macros are replaced with links
325                link = m_linkTable.getLink(CmsMacroResolver.stripMacro(tag.getAttribute(attr)));
326                if (link != null) {
327                    // link management check
328                    String l = link.getLink(m_cms);
329                    if (TAG_PARAM.equals(tag.getTagName())) {
330                        // HACK: to distinguish link parameters the link itself has to end with '&' or '?'
331                        // another solution should be a kind of macro...
332                        if (!l.endsWith(CmsRequestUtil.URL_DELIMITER)
333                            && !l.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) {
334                            if (l.indexOf(CmsRequestUtil.URL_DELIMITER) > 0) {
335                                l += CmsRequestUtil.PARAMETER_DELIMITER;
336                            } else {
337                                l += CmsRequestUtil.URL_DELIMITER;
338                            }
339                        }
340                    }
341                    // set the real target
342                    tag.setAttribute(attr, CmsEncoder.escapeXml(l));
343
344                    // In the Online project, remove href attributes with broken links from A tags.
345                    if (tag.getTagName().equalsIgnoreCase("A")
346                        && m_cms.getRequestContext().isOnlineOrEditDisabled()
347                        && link.isInternal()
348                        && (link.getResource() == null)) {
349                        // getResource() == null could either mean checkConsistency has not been called, or that the link is broken.
350                        // so we have to call checkConsistency to eliminate the first possibility.
351                        link.checkConsistency(m_cms);
352                        if (link.getResource() == null) {
353                            tag.removeAttribute(ATTRIBUTE_HREF);
354                            tag.setAttribute(CmsGwtConstants.ATTR_DEAD_LINK_MARKER, "true");
355                        }
356                    }
357                }
358                break;
359            case REPLACE_LINKS:
360                // links are replaced with macros
361                String targetUri = tag.getAttribute(attr);
362                if (CmsStringUtil.isNotEmpty(targetUri)) {
363                    String internalUri = null;
364                    if (!CmsMacroResolver.isMacro(targetUri)) {
365                        m_cms.getRequestContext().setAttribute(
366                            CmsDefaultLinkSubstitutionHandler.DONT_USE_CURRENT_SITE_FOR_WORKPLACE_REQUESTS,
367                            "true");
368                        internalUri = OpenCms.getLinkManager().getRootPath(m_cms, targetUri, m_relativePath);
369                    }
370                    // HACK: to distinguish link parameters the link itself has to end with '&' or '?'
371                    // another solution should be a kind of macro...
372                    if (!TAG_PARAM.equals(tag.getTagName())
373                        || targetUri.endsWith(CmsRequestUtil.URL_DELIMITER)
374                        || targetUri.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) {
375                        if (internalUri != null) {
376                            internalUri = rewriteUri(internalUri);
377                            // this is an internal link
378                            link = m_linkTable.addLink(type, internalUri, true);
379                            // link management check
380                            link.checkConsistency(m_cms);
381
382                            if ("IMG".equals(tag.getTagName()) || TAG_AREA.equals(tag.getTagName())) {
383                                // now ensure the image has the "alt" attribute set
384                                setAltAttributeFromTitle(tag, internalUri);
385                            }
386                        } else {
387                            // this is an external link
388                            link = m_linkTable.addLink(type, targetUri, false);
389                        }
390                    }
391                    if (link != null) {
392                        tag.setAttribute(attr, CmsMacroResolver.formatMacro(link.getName()));
393                    }
394                }
395                break;
396            default: // empty
397        }
398    }
399
400    /**
401     * Process a link tag.<p>
402     *
403     * @param tag the tag to process
404     */
405    protected void processLinkTag(LinkTag tag) {
406
407        processLink(tag, ATTRIBUTE_HREF, CmsRelationType.valueOf(tag.getTagName()));
408    }
409
410    /**
411     * Process an object tag.<p>
412     *
413     * @param tag the tag to process
414     */
415    protected void processObjectTag(ObjectTag tag) {
416
417        CmsRelationType type = CmsRelationType.valueOf(tag.getTagName());
418        for (int i = 0; i < OBJECT_TAG_LINKED_ATTRIBS.length; i++) {
419            String attr = OBJECT_TAG_LINKED_ATTRIBS[i];
420            processLink(tag, attr, type);
421            if ((i == 0) && (tag.getAttribute(attr) != null)) {
422                // if code base is available, the other attributes are relative to it, so do not process them
423                break;
424            }
425        }
426        SimpleNodeIterator itChildren = tag.children();
427        while (itChildren.hasMoreNodes()) {
428            Node node = itChildren.nextNode();
429            if (node instanceof Tag) {
430                Tag childTag = (Tag)node;
431                if (TAG_PARAM.equals(childTag.getTagName())) {
432                    processLink(childTag, ATTRIBUTE_VALUE, type);
433                }
434            }
435        }
436    }
437
438    /**
439     * Ensures that the given tag has the "alt" attribute set.<p>
440     *
441     * if not set, it will be set from the title of the given resource.<p>
442     *
443     * @param tag the tag to set the alt attribute for
444     * @param internalUri the internal URI to get the title from
445     */
446    protected void setAltAttributeFromTitle(Tag tag, String internalUri) {
447
448        boolean hasAltAttrib = (tag.getAttribute("alt") != null);
449        if (!hasAltAttrib) {
450            String value = null;
451            if ((internalUri != null) && (m_rootCms != null)) {
452                // internal image: try to read the "alt" text from the "Title" property
453                try {
454                    value = m_rootCms.readPropertyObject(
455                        internalUri,
456                        CmsPropertyDefinition.PROPERTY_TITLE,
457                        false).getValue();
458                } catch (CmsException e) {
459                    // property can't be read, ignore
460                }
461            }
462            // some editors add a "/" at the end of the tag, we must make sure to insert before that
463            @SuppressWarnings("unchecked")
464            Vector<Attribute> attrs = tag.getAttributesEx();
465            // first element is always the tag name
466            attrs.add(1, new Attribute(" "));
467            attrs.add(2, new Attribute("alt", value == null ? "" : value, '"'));
468        }
469    }
470
471    /**
472     * Use the {@link org.opencms.file.wrapper.CmsObjectWrapper} to restore the link in the VFS.<p>
473     *
474     * @param internalUri the internal URI to restore
475     *
476     * @return the restored URI
477     */
478    private String rewriteUri(String internalUri) {
479
480        // if an object wrapper is used, rewrite the uri
481        if (m_cms != null) {
482            Object obj = m_cms.getRequestContext().getAttribute(CmsObjectWrapper.ATTRIBUTE_NAME);
483            if (obj != null) {
484                CmsObjectWrapper wrapper = (CmsObjectWrapper)obj;
485                return wrapper.restoreLink(internalUri);
486            }
487        }
488
489        return internalUri;
490    }
491}