001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.staticexport;
029
030import org.opencms.file.CmsObject;
031import org.opencms.file.CmsPropertyDefinition;
032import org.opencms.file.wrapper.CmsObjectWrapper;
033import org.opencms.i18n.CmsEncoder;
034import org.opencms.main.CmsException;
035import org.opencms.main.OpenCms;
036import org.opencms.relations.CmsLink;
037import org.opencms.relations.CmsRelationType;
038import org.opencms.util.CmsHtmlParser;
039import org.opencms.util.CmsMacroResolver;
040import org.opencms.util.CmsRequestUtil;
041import org.opencms.util.CmsStringUtil;
042
043import java.util.Vector;
044
045import org.htmlparser.Attribute;
046import org.htmlparser.Node;
047import org.htmlparser.Tag;
048import org.htmlparser.tags.ImageTag;
049import org.htmlparser.tags.LinkTag;
050import org.htmlparser.tags.ObjectTag;
051import org.htmlparser.util.ParserException;
052import org.htmlparser.util.SimpleNodeIterator;
053
054/**
055 * Implements the HTML parser node visitor pattern to
056 * exchange all links on the page.<p>
057 *
058 * @since 6.0.0
059 */
060public class CmsLinkProcessor extends CmsHtmlParser {
061
    /** Name of the <code>href</code> attribute, processed for link and area tags. */
    public static final String ATTRIBUTE_HREF = "href";

    /** Name of the <code>src</code> attribute, processed for image, embed and iframe tags. */
    public static final String ATTRIBUTE_SRC = "src";

    /** Name of the <code>value</code> attribute, processed for param tags nested inside an object tag. */
    public static final String ATTRIBUTE_VALUE = "value";

    /** Closing HTML fragment, the counterpart of {@link #HTML_START}. */
    public static final String HTML_END = "</body></html>";

    /** Opening HTML fragment, the counterpart of {@link #HTML_END}. */
    public static final String HTML_START = "<html><body>";

    /** Upper case name of the area tag, compared against the name reported by the parsed tag. */
    public static final String TAG_AREA = "AREA";

    /** Upper case name of the embed tag, compared against the name reported by the parsed tag. */
    public static final String TAG_EMBED = "EMBED";

    /** Upper case name of the iframe tag, compared against the name reported by the parsed tag. */
    public static final String TAG_IFRAME = "IFRAME";

    /** Upper case name of the param tag, compared against the name reported by the parsed tag. */
    public static final String TAG_PARAM = "PARAM";

    /** Attributes of the embed tag that may contain links. */
    private static final String[] EMBED_TAG_LINKED_ATTRIBS = new String[] {ATTRIBUTE_SRC, "pluginurl", "pluginspage"};

    /**
     * Attributes of the object tag that may contain links.<p>
     *
     * "codebase" has to be first: if it is present, the other attributes are relative to it and are not processed.
     */
    private static final String[] OBJECT_TAG_LINKED_ATTRIBS = new String[] {"codebase", "data", "datasrc"};

    /** Processing mode "process links": macros in the content are replaced by links. */
    private static final int PROCESS_LINKS = 1;

    /** Processing mode "replace links": links in the content are replaced by macros. */
    private static final int REPLACE_LINKS = 0;

    /** The current user's OpenCms context, containing the user's permissions and site root context. */
    private CmsObject m_cms;

    /** The selected encoding to use for parsing the HTML. */
    private String m_encoding;

    /** The link table used for link macro replacements. */
    private CmsLinkTable m_linkTable;

    /** Current processing mode, either {@link #PROCESS_LINKS} or {@link #REPLACE_LINKS}. */
    private int m_mode;

    /** The relative path for relative links, if not set, relative links are treated as external links. */
    private String m_relativePath;

    /** Another OpenCms context based on the current user's OpenCms context, but with the site root set to '/'. */
    private CmsObject m_rootCms;
118
119    /**
120     * Creates a new link processor.<p>
121     *
122     * @param cms the current users OpenCms context
123     * @param linkTable the link table to use
124     * @param encoding the encoding to use for parsing the HTML content
125     * @param relativePath additional path for links with relative path (only used in "replace" mode)
126     */
127    public CmsLinkProcessor(CmsObject cms, CmsLinkTable linkTable, String encoding, String relativePath) {
128
129        // echo mode must be on for link processor
130        super(true);
131
132        m_cms = cms;
133        if (m_cms != null) {
134            try {
135                m_rootCms = OpenCms.initCmsObject(cms);
136                m_rootCms.getRequestContext().setSiteRoot("/");
137            } catch (CmsException e) {
138                // this should not happen
139                m_rootCms = null;
140            }
141        }
142        m_linkTable = linkTable;
143        m_encoding = encoding;
144        m_relativePath = relativePath;
145    }
146
147    /**
148     * Escapes all <code>&</code>, e.g. replaces them with a <code>&amp;</code>.<p>
149     *
150     * @param source the String to escape
151     * @return the escaped String
152     */
153    public static String escapeLink(String source) {
154
155        if (source == null) {
156            return null;
157        }
158        StringBuffer result = new StringBuffer(source.length() * 2);
159        int terminatorIndex;
160        for (int i = 0; i < source.length(); ++i) {
161            char ch = source.charAt(i);
162            switch (ch) {
163                case '&':
164                    // don't escape already escaped &amps;
165                    terminatorIndex = source.indexOf(';', i);
166                    if (terminatorIndex > 0) {
167                        String substr = source.substring(i + 1, terminatorIndex);
168                        if ("amp".equals(substr)) {
169                            result.append(ch);
170                        } else {
171                            result.append("&amp;");
172                        }
173                    } else {
174                        result.append("&amp;");
175                    }
176                    break;
177                default:
178                    result.append(ch);
179            }
180        }
181        return new String(result);
182    }
183
184    /**
185     * Unescapes all <code>&amp;amp;</code>, that is replaces them with a <code>&</code>.<p>
186     *
187     * @param source the String to unescape
188     * @return the unescaped String
189     */
190    public static String unescapeLink(String source) {
191
192        if (source == null) {
193            return null;
194        }
195        return CmsStringUtil.substitute(source, "&amp;", "&");
196
197    }
198
199    /**
200     * Returns the link table this link processor was initialized with.<p>
201     *
202     * @return the link table this link processor was initialized with
203     */
204    public CmsLinkTable getLinkTable() {
205
206        return m_linkTable;
207    }
208
209    /**
210     * Starts link processing for the given content in processing mode.<p>
211     *
212     * Macros are replaced by links.<p>
213     *
214     * @param content the content to process
215     * @return the processed content with replaced macros
216     *
217     * @throws ParserException if something goes wrong
218     */
219    public String processLinks(String content) throws ParserException {
220
221        m_mode = PROCESS_LINKS;
222        return process(content, m_encoding);
223    }
224
225    /**
226     * Starts link processing for the given content in replacement mode.<p>
227     *
228     * Links are replaced by macros.<p>
229     *
230     * @param content the content to process
231     * @return the processed content with replaced links
232     *
233     * @throws ParserException if something goes wrong
234     */
235    public String replaceLinks(String content) throws ParserException {
236
237        m_mode = REPLACE_LINKS;
238        return process(content, m_encoding);
239    }
240
241    /**
242     * Visitor method to process a tag (start).<p>
243     *
244     * @param tag the tag to process
245     */
246    @Override
247    public void visitTag(Tag tag) {
248
249        if (tag instanceof LinkTag) {
250            processLinkTag((LinkTag)tag);
251        } else if (tag instanceof ImageTag) {
252            processImageTag((ImageTag)tag);
253        } else if (tag instanceof ObjectTag) {
254            processObjectTag((ObjectTag)tag);
255        } else {
256            // there are no specialized tag classes for these tags :(
257            if (TAG_EMBED.equals(tag.getTagName())) {
258                processEmbedTag(tag);
259            } else if (TAG_AREA.equals(tag.getTagName())) {
260                processAreaTag(tag);
261            } else if (TAG_IFRAME.equals(tag.getTagName())) {
262                String src = tag.getAttribute(ATTRIBUTE_SRC);
263                if ((src != null) && !src.startsWith("//")) {
264                    // link processing does not work for protocol-relative URLs, which were once used in Youtube embed
265                    // codes.
266                    processLink(tag, ATTRIBUTE_SRC, CmsRelationType.HYPERLINK);
267                }
268            }
269        }
270        // append text content of the tag (may have been changed by above methods)
271        super.visitTag(tag);
272    }
273
274    /**
275     * Process an area tag.<p>
276     *
277     * @param tag the tag to process
278     */
279    protected void processAreaTag(Tag tag) {
280
281        processLink(tag, ATTRIBUTE_HREF, CmsRelationType.HYPERLINK);
282    }
283
284    /**
285     * Process an embed tag.<p>
286     *
287     * @param tag the tag to process
288     */
289    protected void processEmbedTag(Tag tag) {
290
291        for (int i = 0; i < EMBED_TAG_LINKED_ATTRIBS.length; i++) {
292            String attr = EMBED_TAG_LINKED_ATTRIBS[i];
293            processLink(tag, attr, CmsRelationType.EMBEDDED_OBJECT);
294        }
295    }
296
297    /**
298     * Process an image tag.<p>
299     *
300     * @param tag the tag to process
301     */
302    protected void processImageTag(ImageTag tag) {
303
304        processLink(tag, ATTRIBUTE_SRC, CmsRelationType.valueOf(tag.getTagName()));
305    }
306
307    /**
308     * Process a tag having a link in the given attribute, considering the link as the given type.<p>
309     *
310     * @param tag the tag to process
311     * @param attr the attribute
312     * @param type the link type
313     */
314    protected void processLink(Tag tag, String attr, CmsRelationType type) {
315
316        if (tag.getAttribute(attr) == null) {
317            return;
318        }
319        CmsLink link = null;
320        switch (m_mode) {
321            case PROCESS_LINKS:
322                // macros are replaced with links
323                link = m_linkTable.getLink(CmsMacroResolver.stripMacro(tag.getAttribute(attr)));
324                if (link != null) {
325                    // link management check
326                    String l = link.getLink(m_cms);
327                    if (TAG_PARAM.equals(tag.getTagName())) {
328                        // HACK: to distinguish link parameters the link itself has to end with '&' or '?'
329                        // another solution should be a kind of macro...
330                        if (!l.endsWith(CmsRequestUtil.URL_DELIMITER)
331                            && !l.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) {
332                            if (l.indexOf(CmsRequestUtil.URL_DELIMITER) > 0) {
333                                l += CmsRequestUtil.PARAMETER_DELIMITER;
334                            } else {
335                                l += CmsRequestUtil.URL_DELIMITER;
336                            }
337                        }
338                    }
339                    // set the real target
340                    tag.setAttribute(attr, CmsEncoder.escapeXml(l));
341                }
342                break;
343            case REPLACE_LINKS:
344                // links are replaced with macros
345                String targetUri = tag.getAttribute(attr);
346                if (CmsStringUtil.isNotEmpty(targetUri)) {
347                    String internalUri = null;
348                    if (!CmsMacroResolver.isMacro(targetUri)) {
349                        m_cms.getRequestContext().setAttribute(
350                            CmsDefaultLinkSubstitutionHandler.DONT_USE_CURRENT_SITE_FOR_WORKPLACE_REQUESTS,
351                            "true");
352                        internalUri = OpenCms.getLinkManager().getRootPath(m_cms, targetUri, m_relativePath);
353                    }
354                    // HACK: to distinguish link parameters the link itself has to end with '&' or '?'
355                    // another solution should be a kind of macro...
356                    if (!TAG_PARAM.equals(tag.getTagName())
357                        || targetUri.endsWith(CmsRequestUtil.URL_DELIMITER)
358                        || targetUri.endsWith(CmsRequestUtil.PARAMETER_DELIMITER)) {
359                        if (internalUri != null) {
360                            internalUri = rewriteUri(internalUri);
361                            // this is an internal link
362                            link = m_linkTable.addLink(type, internalUri, true);
363                            // link management check
364                            link.checkConsistency(m_cms);
365
366                            if ("IMG".equals(tag.getTagName()) || TAG_AREA.equals(tag.getTagName())) {
367                                // now ensure the image has the "alt" attribute set
368                                setAltAttributeFromTitle(tag, internalUri);
369                            }
370                        } else {
371                            // this is an external link
372                            link = m_linkTable.addLink(type, targetUri, false);
373                        }
374                    }
375                    if (link != null) {
376                        tag.setAttribute(attr, CmsMacroResolver.formatMacro(link.getName()));
377                    }
378                }
379                break;
380            default: // empty
381        }
382    }
383
384    /**
385     * Process a link tag.<p>
386     *
387     * @param tag the tag to process
388     */
389    protected void processLinkTag(LinkTag tag) {
390
391        processLink(tag, ATTRIBUTE_HREF, CmsRelationType.valueOf(tag.getTagName()));
392    }
393
394    /**
395     * Process an object tag.<p>
396     *
397     * @param tag the tag to process
398     */
399    protected void processObjectTag(ObjectTag tag) {
400
401        CmsRelationType type = CmsRelationType.valueOf(tag.getTagName());
402        for (int i = 0; i < OBJECT_TAG_LINKED_ATTRIBS.length; i++) {
403            String attr = OBJECT_TAG_LINKED_ATTRIBS[i];
404            processLink(tag, attr, type);
405            if ((i == 0) && (tag.getAttribute(attr) != null)) {
406                // if code base is available, the other attributes are relative to it, so do not process them
407                break;
408            }
409        }
410        SimpleNodeIterator itChildren = tag.children();
411        while (itChildren.hasMoreNodes()) {
412            Node node = itChildren.nextNode();
413            if (node instanceof Tag) {
414                Tag childTag = (Tag)node;
415                if (TAG_PARAM.equals(childTag.getTagName())) {
416                    processLink(childTag, ATTRIBUTE_VALUE, type);
417                }
418            }
419        }
420    }
421
422    /**
423     * Ensures that the given tag has the "alt" attribute set.<p>
424     *
425     * if not set, it will be set from the title of the given resource.<p>
426     *
427     * @param tag the tag to set the alt attribute for
428     * @param internalUri the internal URI to get the title from
429     */
430    protected void setAltAttributeFromTitle(Tag tag, String internalUri) {
431
432        boolean hasAltAttrib = (tag.getAttribute("alt") != null);
433        if (!hasAltAttrib) {
434            String value = null;
435            if ((internalUri != null) && (m_rootCms != null)) {
436                // internal image: try to read the "alt" text from the "Title" property
437                try {
438                    value = m_rootCms.readPropertyObject(
439                        internalUri,
440                        CmsPropertyDefinition.PROPERTY_TITLE,
441                        false).getValue();
442                } catch (CmsException e) {
443                    // property can't be read, ignore
444                }
445            }
446            // some editors add a "/" at the end of the tag, we must make sure to insert before that
447            @SuppressWarnings("unchecked")
448            Vector<Attribute> attrs = tag.getAttributesEx();
449            // first element is always the tag name
450            attrs.add(1, new Attribute(" "));
451            attrs.add(2, new Attribute("alt", value == null ? "" : value, '"'));
452        }
453    }
454
455    /**
456     * Use the {@link org.opencms.file.wrapper.CmsObjectWrapper} to restore the link in the VFS.<p>
457     *
458     * @param internalUri the internal URI to restore
459     *
460     * @return the restored URI
461     */
462    private String rewriteUri(String internalUri) {
463
464        // if an object wrapper is used, rewrite the uri
465        if (m_cms != null) {
466            Object obj = m_cms.getRequestContext().getAttribute(CmsObjectWrapper.ATTRIBUTE_NAME);
467            if (obj != null) {
468                CmsObjectWrapper wrapper = (CmsObjectWrapper)obj;
469                return wrapper.restoreLink(internalUri);
470            }
471        }
472
473        return internalUri;
474    }
475}