001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.util;
029
030import org.opencms.main.CmsLog;
031
032import java.util.Set;
033import java.util.TreeSet;
034import java.util.Vector;
035
036import org.apache.commons.logging.Log;
037
038import org.htmlparser.Attribute;
039import org.htmlparser.Node;
040import org.htmlparser.NodeFilter;
041import org.htmlparser.PrototypicalNodeFactory;
042import org.htmlparser.Tag;
043import org.htmlparser.lexer.Page;
044import org.htmlparser.scanners.Scanner;
045import org.htmlparser.util.NodeList;
046import org.htmlparser.util.ParserException;
047import org.htmlparser.util.SimpleNodeIterator;
048import org.htmlparser.visitors.NodeVisitor;
049
050/**
051 *
052 * A tag factory for htmlparser that is able to "remove tags".<p>
053 *
054 * Create an instance, add the {@link org.htmlparser.Tag} instances to remove and assign this
055 * factory to the {@link org.htmlparser.Parser} before starting a visit. A demo usage is shown in
056 * <code>CmsTagReplaceParser</code>.<p>
057 *
058 * The tags are not actually removed: They are linked in the document object model tree of the HTML
059 * that the parser generates. They just will not accept any {@link NodeVisitor} instances and
060 * therefore be invisible in any output a visitor will generate from the visited tree.<p>
061 *
062 * The decision whether a tag is removed can be controlled in two ways:
063 * <ol>
064 *  <li>
065 *   <code>{@link #addTagRemoval(Tag)}</code><br/>
066 *   <p>
067 *   The given tag will be removed ("invisible in the DOM").
068 *   </p>
069 *  </li>
070 *  <li>
071 *   <code>{@link #addTagPreserve(Tag)}</code><br/>
072 *   <p>
073 *    The given tag will be kept as-is. The following behaviour happens if this method is used:
074 *    <ol>
075 *     <li>
076 *      Once <code>{@link #addTagPreserve(Tag)}</code> has been called all Tags that are not added
077 *      to this method will be removed. <strong>We are in include mode then</strong>.
078 *     </li>
079 *     <li>
080 *      The Tags provided to <code>{@link #addTagRemoval(Tag)}</code> will only have the
081 *      power to hide exactly the same tags that are given to <code>{@link #addTagPreserve(Tag)}</code>:
082 *      <strong>Deny is stronger than allow.</strong>
083 *     </li>
084 *    </ol>
085 *   </p>
086 *  </li>
087 * </ol>
088 *
089 * @since 6.1.8
090 */
091public final class CmsHtmlTagRemoveFactory extends PrototypicalNodeFactory {
092
093    /**
094     * A Tag implementation that will not accept any {@link NodeVisitor} stopping by.<p>
095     *
096     * When visiting the corresponding tree of tags, this tag will be there but the visitor will not
097     * see it as it is not accepted. This allows "elimination" of this tag in the output the visitor
098     * generates from the document object model (e.g. HTML code again).<p>
099     *
100     * Potential child tags will be visible to visitors (unless they are instances of this class).<p>
101     *
102     * @since 6.1.8
103     */
104    private static final class CmsInvisibleTag implements Tag {
105
106        /** The real underlying tag. */
107        private Tag m_decorated;
108
109        /**
110         * Constructor with the delegate to wrap.
111         * <p>
112         *
113         * Every property is accessed transparently from the delegate, except that visitors are not
114         * welcome.
115         * <p>
116         *
117         * @param delegate the tag to hide.
118         */
119        CmsInvisibleTag(Tag delegate) {
120
121            m_decorated = delegate;
122        }
123
124        /**
125         * @see org.htmlparser.Tag#accept(org.htmlparser.visitors.NodeVisitor)
126         */
127        public void accept(NodeVisitor visitor) {
128
129            // be invisible but show the children (if they like visits)
130            NodeList children = m_decorated.getChildren();
131            if (children == null) {
132                return;
133            }
134            SimpleNodeIterator itChildren = children.elements();
135            while (itChildren.hasMoreNodes()) {
136                itChildren.nextNode().accept(visitor);
137            }
138        }
139
140        /**
141         * @see org.htmlparser.Tag#breaksFlow()
142         */
143        public boolean breaksFlow() {
144
145            return m_decorated.breaksFlow();
146        }
147
148        /**
149         * @see org.htmlparser.Node#clone()
150         */
151        @Override
152        public Object clone() throws CloneNotSupportedException {
153
154            return m_decorated.clone();
155        }
156
157        /**
158         * @see org.htmlparser.Node#collectInto(org.htmlparser.util.NodeList,
159         *      org.htmlparser.NodeFilter)
160         */
161        public void collectInto(NodeList arg0, NodeFilter arg1) {
162
163            m_decorated.collectInto(arg0, arg1);
164        }
165
166        /**
167         * @see org.htmlparser.Node#doSemanticAction()
168         */
169        public void doSemanticAction() throws ParserException {
170
171            m_decorated.doSemanticAction();
172        }
173
174        /**
175         * @see org.htmlparser.Tag#getAttribute(java.lang.String)
176         */
177        public String getAttribute(String arg0) {
178
179            return m_decorated.getAttribute(arg0);
180        }
181
182        /**
183         * @see org.htmlparser.Tag#getAttributeEx(java.lang.String)
184         */
185        public Attribute getAttributeEx(String arg0) {
186
187            return m_decorated.getAttributeEx(arg0);
188        }
189
190        /**
191         * @see org.htmlparser.Tag#getAttributesEx()
192         */
193        public Vector<Attribute> getAttributesEx() {
194
195            return m_decorated.getAttributesEx();
196        }
197
198        /**
199         * @see org.htmlparser.Node#getChildren()
200         */
201        public NodeList getChildren() {
202
203            return m_decorated.getChildren();
204        }
205
206        /**
207         * @see org.htmlparser.Tag#getEnders()
208         */
209        public String[] getEnders() {
210
211            return m_decorated.getEnders();
212        }
213
214        /**
215         * @see org.htmlparser.Tag#getEndingLineNumber()
216         */
217        public int getEndingLineNumber() {
218
219            return m_decorated.getEndingLineNumber();
220        }
221
222        /**
223         * @see org.htmlparser.Node#getEndPosition()
224         */
225        public int getEndPosition() {
226
227            return m_decorated.getEndPosition();
228        }
229
230        /**
231         * @see org.htmlparser.Tag#getEndTag()
232         */
233        public Tag getEndTag() {
234
235            return m_decorated.getEndTag();
236        }
237
238        /**
239         * @see org.htmlparser.Tag#getEndTagEnders()
240         */
241        public String[] getEndTagEnders() {
242
243            return m_decorated.getEndTagEnders();
244        }
245
246        /**
247         * @see org.htmlparser.Node#getFirstChild()
248         */
249        public Node getFirstChild() {
250
251            return m_decorated.getFirstChild();
252        }
253
254        /**
255         * @see org.htmlparser.Tag#getIds()
256         */
257        public String[] getIds() {
258
259            return m_decorated.getIds();
260        }
261
262        /**
263         * @see org.htmlparser.Node#getLastChild()
264         */
265        public Node getLastChild() {
266
267            return m_decorated.getLastChild();
268        }
269
270        /**
271         * @see org.htmlparser.Node#getNextSibling()
272         */
273        public Node getNextSibling() {
274
275            return m_decorated.getNextSibling();
276        }
277
278        /**
279         * @see org.htmlparser.Node#getPage()
280         */
281        public Page getPage() {
282
283            return m_decorated.getPage();
284        }
285
286        /**
287         * @see org.htmlparser.Node#getParent()
288         */
289        public Node getParent() {
290
291            return m_decorated.getParent();
292        }
293
294        /**
295         * @see org.htmlparser.Node#getPreviousSibling()
296         */
297        public Node getPreviousSibling() {
298
299            return m_decorated.getPreviousSibling();
300        }
301
302        /**
303         * @see org.htmlparser.Tag#getRawTagName()
304         */
305        public String getRawTagName() {
306
307            return m_decorated.getRawTagName();
308        }
309
310        /**
311         * @see org.htmlparser.Tag#getStartingLineNumber()
312         */
313        public int getStartingLineNumber() {
314
315            return m_decorated.getStartingLineNumber();
316        }
317
318        /**
319         * @see org.htmlparser.Node#getStartPosition()
320         */
321        public int getStartPosition() {
322
323            return m_decorated.getStartPosition();
324        }
325
326        /**
327         * @see org.htmlparser.Tag#getTagName()
328         */
329        public String getTagName() {
330
331            return m_decorated.getTagName();
332        }
333
334        /**
335         * @see org.htmlparser.Node#getText()
336         */
337        public String getText() {
338
339            return m_decorated.getText();
340        }
341
342        /**
343         * @see org.htmlparser.Tag#getThisScanner()
344         */
345        public Scanner getThisScanner() {
346
347            return m_decorated.getThisScanner();
348        }
349
350        /**
351         * @see org.htmlparser.Tag#isEmptyXmlTag()
352         */
353        public boolean isEmptyXmlTag() {
354
355            return m_decorated.isEmptyXmlTag();
356        }
357
358        /**
359         * @see org.htmlparser.Tag#isEndTag()
360         */
361        public boolean isEndTag() {
362
363            return m_decorated.isEndTag();
364        }
365
366        /**
367         * @see org.htmlparser.Tag#removeAttribute(java.lang.String)
368         */
369        public void removeAttribute(String arg0) {
370
371            m_decorated.removeAttribute(arg0);
372        }
373
374        /**
375         * @see org.htmlparser.Tag#setAttribute(java.lang.String, java.lang.String)
376         */
377        public void setAttribute(String arg0, String arg1) {
378
379            m_decorated.setAttribute(arg0, arg1);
380        }
381
382        /**
383         * @see org.htmlparser.Tag#setAttribute(java.lang.String, java.lang.String, char)
384         */
385        public void setAttribute(String arg0, String arg1, char arg2) {
386
387            m_decorated.setAttribute(arg0, arg1, arg2);
388        }
389
390        /**
391         * @see org.htmlparser.Tag#setAttributeEx(org.htmlparser.Attribute)
392         */
393        public void setAttributeEx(Attribute arg0) {
394
395            m_decorated.setAttributeEx(arg0);
396        }
397
398        /**
399         * @see org.htmlparser.Tag#setAttributesEx(java.util.Vector)
400         */
401        public void setAttributesEx(Vector arg0) {
402
403            m_decorated.setAttributesEx(arg0);
404        }
405
406        /**
407         * @see org.htmlparser.Node#setChildren(org.htmlparser.util.NodeList)
408         */
409        public void setChildren(NodeList arg0) {
410
411            m_decorated.setChildren(arg0);
412        }
413
414        /**
415         * @see org.htmlparser.Tag#setEmptyXmlTag(boolean)
416         */
417        public void setEmptyXmlTag(boolean arg0) {
418
419            m_decorated.setEmptyXmlTag(arg0);
420        }
421
422        /**
423         * @see org.htmlparser.Node#setEndPosition(int)
424         */
425        public void setEndPosition(int arg0) {
426
427            m_decorated.setEndPosition(arg0);
428        }
429
430        /**
431         * @see org.htmlparser.Tag#setEndTag(org.htmlparser.Tag)
432         */
433        public void setEndTag(Tag arg0) {
434
435            m_decorated.setEndTag(arg0);
436        }
437
438        /**
439         * @see org.htmlparser.Node#setPage(org.htmlparser.lexer.Page)
440         */
441        public void setPage(Page arg0) {
442
443            m_decorated.setPage(arg0);
444        }
445
446        /**
447         * @see org.htmlparser.Node#setParent(org.htmlparser.Node)
448         */
449        public void setParent(Node arg0) {
450
451            m_decorated.setParent(arg0);
452        }
453
454        /**
455         * @see org.htmlparser.Node#setStartPosition(int)
456         */
457        public void setStartPosition(int arg0) {
458
459            m_decorated.setStartPosition(arg0);
460        }
461
462        /**
463         * @see org.htmlparser.Tag#setTagName(java.lang.String)
464         */
465        public void setTagName(String arg0) {
466
467            m_decorated.setTagName(arg0);
468        }
469
470        /**
471         * @see org.htmlparser.Node#setText(java.lang.String)
472         */
473        public void setText(String arg0) {
474
475            m_decorated.setText(arg0);
476        }
477
478        /**
479         * @see org.htmlparser.Tag#setThisScanner(org.htmlparser.scanners.Scanner)
480         */
481        public void setThisScanner(Scanner arg0) {
482
483            m_decorated.setThisScanner(arg0);
484        }
485
486        /**
487         * @see org.htmlparser.Node#toHtml()
488         */
489        public String toHtml() {
490
491            return m_decorated.toHtml();
492        }
493
494        /**
495         * @see org.htmlparser.Node#toHtml(boolean)
496         */
497        public String toHtml(boolean value) {
498
499            return m_decorated.toHtml(value);
500        }
501
502        /**
503         * @see org.htmlparser.Node#toPlainTextString()
504         */
505        public String toPlainTextString() {
506
507            return m_decorated.toPlainTextString();
508        }
509
510        /**
511         * @see org.htmlparser.Node#toString()
512         */
513        @Override
514        public String toString() {
515
516            return m_decorated.toString();
517        }
518
519        /**
520         * @see org.htmlparser.Tag#toTagHtml()
521         */
522        @Override
523        public String toTagHtml() {
524
525            return m_decorated.toTagHtml();
526        }
527    }
528
529    /** The log object for this class. */
530    private static final Log LOG = CmsLog.getLog(CmsHtmlTagRemoveFactory.class);
531
532    /** Generated serial version UID. */
533    private static final long serialVersionUID = 6961158563666656633L;
534
535    /** The tags to hide tothe node visitors. */
536    private Set<String> m_invisibleTags;
537
538    /** The tags to show to the node visitors. */
539    private Set<String> m_visibleTags;
540
541    /**
542     * Create a new factory with all tags registered.
543     * <p>
544     *
545     */
546    public CmsHtmlTagRemoveFactory() {
547
548        super();
549        m_invisibleTags = new TreeSet<String>();
550        m_visibleTags = new TreeSet<String>();
551    }
552
553    /**
554     * Add a tag that will be visible for {@link NodeVisitor} instances.
555     * <p>
556     *
557     * Not only "this" tag will be visible but all parsed Tags that have the same name (case
558     * insensitive).
559     * <p>
560     *
561     * The given tag will be kept as-is. The following behaviour happens if this method is used:
562     * <ol>
563     *  <li>
564     *   Once <code>{@link #addTagPreserve(Tag)}</code> has been called all Tags that are not added
565     *   to this method will be removed. <strong>We are in include mode then</strong>.
566     *  </li>
567     *  <li>
568     *   The Tags provided to <code>{@link #addTagRemoval(Tag)}</code> will only have the
569     *   power to hide exactly the same tags that are given to <code>{@link #addTagPreserve(Tag)}</code>:
570     *   <strong>Deny is stronger than allow.</strong>
571     *  </li>
572     * </ol>
573     * <p>
574     *
575     *
576     * @param tag the tag that will be visible for all {@link NodeVisitor} instances.
577     *
578     * @return true if the tag was added to the internal set of tags to keep, false if not (was
579     *         contained before, has no name,...).
580     */
581    public boolean addTagPreserve(final Tag tag) {
582
583        boolean result = false;
584        String tagName = tag.getTagName();
585        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(tagName)) {
586            result = m_visibleTags.add(tagName.toLowerCase());
587        }
588        return result;
589
590    }
591
592    /**
593     * Add a tag that will be invisible for {@link NodeVisitor} instances.
594     * <p>
595     *
596     * Not only "this" tag will be invisible but all parsed Tags that have the same name (case
597     * insensitive).
598     * <p>
599     *
600     * @param tag the tag that will be visible for all {@link NodeVisitor} instances.
601     *
602     * @return true if the tag was added to the internal set of tags to remove, false if not (was
603     *         contained before, has no name,...).
604     */
605    public boolean addTagRemoval(final Tag tag) {
606
607        boolean result = false;
608        String tagName = tag.getTagName();
609        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(tagName)) {
610            result = m_invisibleTags.add(tagName.toLowerCase());
611        }
612        return result;
613    }
614
615    /**
616     * @see org.htmlparser.PrototypicalNodeFactory#createTagNode(org.htmlparser.lexer.Page, int,
617     *      int, java.util.Vector)
618     */
619    @Override
620    public Tag createTagNode(Page arg0, int arg1, int arg2, Vector arg3) {
621
622        try {
623            String tagName = ((Attribute)arg3.get(0)).getName().toLowerCase();
624            // end tags have names like "/a"....
625            if (tagName.charAt(0) == '/') {
626                tagName = tagName.substring(1);
627            }
628            Tag result = super.createTagNode(arg0, arg1, arg2, arg3);
629            if (!keepTag(tagName)) {
630                result = new CmsInvisibleTag(result);
631            }
632            return result;
633        } catch (RuntimeException rte) {
634            if (LOG.isErrorEnabled()) {
635                // log here, as htmlparser 1.5 did swallow exceptions from here and threw NPEs from
636                // other places
637                LOG.error(rte);
638            }
639            throw rte;
640        }
641    }
642
643    /**
644     * Encapsulation of the "preserve / remove" logic.<p>
645     *
646     * @param tagName the lower case name of the tag to keep or hide
647     *
648     * @return if true the given Tag will be kept, if false it will be removed
649     */
650    private boolean keepTag(final String tagName) {
651
652        boolean result = false;
653        // include mode:
654        if (m_visibleTags.size() > 0) {
655            if (m_visibleTags.contains(tagName)) {
656                result = true;
657            } else {
658                result = false;
659            }
660        }
661        // Power of hide: if no visible tags configured this works as a normal remove,
662        // if visible tags are configured this can change a visible tag to be invisible
663        if (m_invisibleTags.contains(tagName)) {
664            result = false;
665        }
666
667        return result;
668    }
669}