001/* 002 * This library is part of OpenCms - 003 * the Open Source Content Management System 004 * 005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com) 006 * 007 * This library is free software; you can redistribute it and/or 008 * modify it under the terms of the GNU Lesser General Public 009 * License as published by the Free Software Foundation; either 010 * version 2.1 of the License, or (at your option) any later version. 011 * 012 * This library is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * Lesser General Public License for more details. 016 * 017 * For further information about Alkacon Software GmbH & Co. KG, please see the 018 * company website: http://www.alkacon.com 019 * 020 * For further information about OpenCms, please see the 021 * project website: http://www.opencms.org 022 * 023 * You should have received a copy of the GNU Lesser General Public 024 * License along with this library; if not, write to the Free Software 025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 026 */ 027 028package org.opencms.util; 029 030import org.opencms.main.CmsLog; 031 032import java.util.Set; 033import java.util.TreeSet; 034import java.util.Vector; 035 036import org.apache.commons.logging.Log; 037 038import org.htmlparser.Attribute; 039import org.htmlparser.Node; 040import org.htmlparser.NodeFilter; 041import org.htmlparser.PrototypicalNodeFactory; 042import org.htmlparser.Tag; 043import org.htmlparser.lexer.Page; 044import org.htmlparser.scanners.Scanner; 045import org.htmlparser.util.NodeList; 046import org.htmlparser.util.ParserException; 047import org.htmlparser.util.SimpleNodeIterator; 048import org.htmlparser.visitors.NodeVisitor; 049 050/** 051 * 052 * A tag factory for htmlparser that is able to "remove tags".<p> 053 * 054 * Create an instance, add the {@link org.htmlparser.Tag} instances to remove and assign this 055 * factory to the {@link org.htmlparser.Parser} before starting a visit. A demo usage is shown in 056 * <code>CmsTagReplaceParser</code>.<p> 057 * 058 * The tags are not actually removed: They are linked in the document object model tree of the HTML 059 * that the parser generates. They just will not accept any {@link NodeVisitor} instances and 060 * therefore be invisible in any output a visitor will generate from the visited tree.<p> 061 * 062 * The decision whether a tag is removed can be controlled in two ways: 063 * <ol> 064 * <li> 065 * <code>{@link #addTagRemoval(Tag)}</code><br/> 066 * <p> 067 * The given tag will be removed ("invisible in the DOM"). 068 * </p> 069 * </li> 070 * <li> 071 * <code>{@link #addTagPreserve(Tag)}</code><br/> 072 * <p> 073 * The given tag will be kept as-is. The following behaviour happens if this method is used: 074 * <ol> 075 * <li> 076 * Once <code>{@link #addTagPreserve(Tag)}</code> has been called all Tags that are not added 077 * to this method will be removed. <strong>We are in include mode then</strong>. 078 * </li> 079 * <li> 080 * The Tags provided to <code>{@link #addTagRemoval(Tag)}</code> will only have the 081 * power to hide exactly the same tags that are given to <code>{@link #addTagPreserve(Tag)}</code>: 082 * <strong>Deny is stronger than allow.</strong> 083 * </li> 084 * </ol> 085 * </p> 086 * </li> 087 * </ol> 088 * 089 * @since 6.1.8 090 */ 091public final class CmsHtmlTagRemoveFactory extends PrototypicalNodeFactory { 092 093 /** 094 * A Tag implementation that will not accept any {@link NodeVisitor} stopping by.<p> 095 * 096 * When visiting the corresponding tree of tags, this tag will be there but the visitor will not 097 * see it as it is not accepted. This allows "elimination" of this tag in the output the visitor 098 * generates from the document object model (e.g. HTML code again).<p> 099 * 100 * Potential child tags will be visible to visitors (unless they are instances of this class).<p> 101 * 102 * @since 6.1.8 103 */ 104 private static final class CmsInvisibleTag implements Tag { 105 106 /** The real underlying tag. */ 107 private Tag m_decorated; 108 109 /** 110 * Constructor with the delegate to wrap. 111 * <p> 112 * 113 * Every property is accessed transparently from the delegate, except that visitors are not 114 * welcome. 115 * <p> 116 * 117 * @param delegate the tag to hide. 118 */ 119 CmsInvisibleTag(Tag delegate) { 120 121 m_decorated = delegate; 122 } 123 124 /** 125 * @see org.htmlparser.Tag#accept(org.htmlparser.visitors.NodeVisitor) 126 */ 127 public void accept(NodeVisitor visitor) { 128 129 // be invisible but show the children (if they like visits) 130 NodeList children = m_decorated.getChildren(); 131 if (children == null) { 132 return; 133 } 134 SimpleNodeIterator itChildren = children.elements(); 135 while (itChildren.hasMoreNodes()) { 136 itChildren.nextNode().accept(visitor); 137 } 138 } 139 140 /** 141 * @see org.htmlparser.Tag#breaksFlow() 142 */ 143 public boolean breaksFlow() { 144 145 return m_decorated.breaksFlow(); 146 } 147 148 /** 149 * @see org.htmlparser.Node#clone() 150 */ 151 @Override 152 public Object clone() throws CloneNotSupportedException { 153 154 return m_decorated.clone(); 155 } 156 157 /** 158 * @see org.htmlparser.Node#collectInto(org.htmlparser.util.NodeList, 159 * org.htmlparser.NodeFilter) 160 */ 161 public void collectInto(NodeList arg0, NodeFilter arg1) { 162 163 m_decorated.collectInto(arg0, arg1); 164 } 165 166 /** 167 * @see org.htmlparser.Node#doSemanticAction() 168 */ 169 public void doSemanticAction() throws ParserException { 170 171 m_decorated.doSemanticAction(); 172 } 173 174 /** 175 * @see org.htmlparser.Tag#getAttribute(java.lang.String) 176 */ 177 public String getAttribute(String arg0) { 178 179 return m_decorated.getAttribute(arg0); 180 } 181 182 /** 183 * @see org.htmlparser.Tag#getAttributeEx(java.lang.String) 184 */ 185 public Attribute getAttributeEx(String arg0) { 186 187 return m_decorated.getAttributeEx(arg0); 188 } 189 190 /** 191 * @see org.htmlparser.Tag#getAttributesEx() 192 */ 193 public Vector<Attribute> getAttributesEx() { 194 195 return m_decorated.getAttributesEx(); 196 } 197 198 /** 199 * @see org.htmlparser.Node#getChildren() 200 */ 201 public NodeList getChildren() { 202 203 return m_decorated.getChildren(); 204 } 205 206 /** 207 * @see org.htmlparser.Tag#getEnders() 208 */ 209 public String[] getEnders() { 210 211 return m_decorated.getEnders(); 212 } 213 214 /** 215 * @see org.htmlparser.Tag#getEndingLineNumber() 216 */ 217 public int getEndingLineNumber() { 218 219 return m_decorated.getEndingLineNumber(); 220 } 221 222 /** 223 * @see org.htmlparser.Node#getEndPosition() 224 */ 225 public int getEndPosition() { 226 227 return m_decorated.getEndPosition(); 228 } 229 230 /** 231 * @see org.htmlparser.Tag#getEndTag() 232 */ 233 public Tag getEndTag() { 234 235 return m_decorated.getEndTag(); 236 } 237 238 /** 239 * @see org.htmlparser.Tag#getEndTagEnders() 240 */ 241 public String[] getEndTagEnders() { 242 243 return m_decorated.getEndTagEnders(); 244 } 245 246 /** 247 * @see org.htmlparser.Node#getFirstChild() 248 */ 249 public Node getFirstChild() { 250 251 return m_decorated.getFirstChild(); 252 } 253 254 /** 255 * @see org.htmlparser.Tag#getIds() 256 */ 257 public String[] getIds() { 258 259 return m_decorated.getIds(); 260 } 261 262 /** 263 * @see org.htmlparser.Node#getLastChild() 264 */ 265 public Node getLastChild() { 266 267 return m_decorated.getLastChild(); 268 } 269 270 /** 271 * @see org.htmlparser.Node#getNextSibling() 272 */ 273 public Node getNextSibling() { 274 275 return m_decorated.getNextSibling(); 276 } 277 278 /** 279 * @see org.htmlparser.Node#getPage() 280 */ 281 public Page getPage() { 282 283 return m_decorated.getPage(); 284 } 285 286 /** 287 * @see org.htmlparser.Node#getParent() 288 */ 289 public Node getParent() { 290 291 return m_decorated.getParent(); 292 } 293 294 /** 295 * @see org.htmlparser.Node#getPreviousSibling() 296 */ 297 public Node getPreviousSibling() { 298 299 return m_decorated.getPreviousSibling(); 300 } 301 302 /** 303 * @see org.htmlparser.Tag#getRawTagName() 304 */ 305 public String getRawTagName() { 306 307 return m_decorated.getRawTagName(); 308 } 309 310 /** 311 * @see org.htmlparser.Tag#getStartingLineNumber() 312 */ 313 public int getStartingLineNumber() { 314 315 return m_decorated.getStartingLineNumber(); 316 } 317 318 /** 319 * @see org.htmlparser.Node#getStartPosition() 320 */ 321 public int getStartPosition() { 322 323 return m_decorated.getStartPosition(); 324 } 325 326 /** 327 * @see org.htmlparser.Tag#getTagName() 328 */ 329 public String getTagName() { 330 331 return m_decorated.getTagName(); 332 } 333 334 /** 335 * @see org.htmlparser.Node#getText() 336 */ 337 public String getText() { 338 339 return m_decorated.getText(); 340 } 341 342 /** 343 * @see org.htmlparser.Tag#getThisScanner() 344 */ 345 public Scanner getThisScanner() { 346 347 return m_decorated.getThisScanner(); 348 } 349 350 /** 351 * @see org.htmlparser.Tag#isEmptyXmlTag() 352 */ 353 public boolean isEmptyXmlTag() { 354 355 return m_decorated.isEmptyXmlTag(); 356 } 357 358 /** 359 * @see org.htmlparser.Tag#isEndTag() 360 */ 361 public boolean isEndTag() { 362 363 return m_decorated.isEndTag(); 364 } 365 366 /** 367 * @see org.htmlparser.Tag#removeAttribute(java.lang.String) 368 */ 369 public void removeAttribute(String arg0) { 370 371 m_decorated.removeAttribute(arg0); 372 } 373 374 /** 375 * @see org.htmlparser.Tag#setAttribute(java.lang.String, java.lang.String) 376 */ 377 public void setAttribute(String arg0, String arg1) { 378 379 m_decorated.setAttribute(arg0, arg1); 380 } 381 382 /** 383 * @see org.htmlparser.Tag#setAttribute(java.lang.String, java.lang.String, char) 384 */ 385 public void setAttribute(String arg0, String arg1, char arg2) { 386 387 m_decorated.setAttribute(arg0, arg1, arg2); 388 } 389 390 /** 391 * @see org.htmlparser.Tag#setAttributeEx(org.htmlparser.Attribute) 392 */ 393 public void setAttributeEx(Attribute arg0) { 394 395 m_decorated.setAttributeEx(arg0); 396 } 397 398 /** 399 * @see org.htmlparser.Tag#setAttributesEx(java.util.Vector) 400 */ 401 public void setAttributesEx(Vector arg0) { 402 403 m_decorated.setAttributesEx(arg0); 404 } 405 406 /** 407 * @see org.htmlparser.Node#setChildren(org.htmlparser.util.NodeList) 408 */ 409 public void setChildren(NodeList arg0) { 410 411 m_decorated.setChildren(arg0); 412 } 413 414 /** 415 * @see org.htmlparser.Tag#setEmptyXmlTag(boolean) 416 */ 417 public void setEmptyXmlTag(boolean arg0) { 418 419 m_decorated.setEmptyXmlTag(arg0); 420 } 421 422 /** 423 * @see org.htmlparser.Node#setEndPosition(int) 424 */ 425 public void setEndPosition(int arg0) { 426 427 m_decorated.setEndPosition(arg0); 428 } 429 430 /** 431 * @see org.htmlparser.Tag#setEndTag(org.htmlparser.Tag) 432 */ 433 public void setEndTag(Tag arg0) { 434 435 m_decorated.setEndTag(arg0); 436 } 437 438 /** 439 * @see org.htmlparser.Node#setPage(org.htmlparser.lexer.Page) 440 */ 441 public void setPage(Page arg0) { 442 443 m_decorated.setPage(arg0); 444 } 445 446 /** 447 * @see org.htmlparser.Node#setParent(org.htmlparser.Node) 448 */ 449 public void setParent(Node arg0) { 450 451 m_decorated.setParent(arg0); 452 } 453 454 /** 455 * @see org.htmlparser.Node#setStartPosition(int) 456 */ 457 public void setStartPosition(int arg0) { 458 459 m_decorated.setStartPosition(arg0); 460 } 461 462 /** 463 * @see org.htmlparser.Tag#setTagName(java.lang.String) 464 */ 465 public void setTagName(String arg0) { 466 467 m_decorated.setTagName(arg0); 468 } 469 470 /** 471 * @see org.htmlparser.Node#setText(java.lang.String) 472 */ 473 public void setText(String arg0) { 474 475 m_decorated.setText(arg0); 476 } 477 478 /** 479 * @see org.htmlparser.Tag#setThisScanner(org.htmlparser.scanners.Scanner) 480 */ 481 public void setThisScanner(Scanner arg0) { 482 483 m_decorated.setThisScanner(arg0); 484 } 485 486 /** 487 * @see org.htmlparser.Node#toHtml() 488 */ 489 public String toHtml() { 490 491 return m_decorated.toHtml(); 492 } 493 494 /** 495 * @see org.htmlparser.Node#toHtml(boolean) 496 */ 497 public String toHtml(boolean value) { 498 499 return m_decorated.toHtml(value); 500 } 501 502 /** 503 * @see org.htmlparser.Node#toPlainTextString() 504 */ 505 public String toPlainTextString() { 506 507 return m_decorated.toPlainTextString(); 508 } 509 510 /** 511 * @see org.htmlparser.Node#toString() 512 */ 513 @Override 514 public String toString() { 515 516 return m_decorated.toString(); 517 } 518 519 /** 520 * @see org.htmlparser.Tag#toTagHtml() 521 */ 522 @Override 523 public String toTagHtml() { 524 525 return m_decorated.toTagHtml(); 526 } 527 } 528 529 /** The log object for this class. */ 530 private static final Log LOG = CmsLog.getLog(CmsHtmlTagRemoveFactory.class); 531 532 /** Generated serial version UID. */ 533 private static final long serialVersionUID = 6961158563666656633L; 534 535 /** The tags to hide tothe node visitors. */ 536 private Set<String> m_invisibleTags; 537 538 /** The tags to show to the node visitors. */ 539 private Set<String> m_visibleTags; 540 541 /** 542 * Create a new factory with all tags registered. 543 * <p> 544 * 545 */ 546 public CmsHtmlTagRemoveFactory() { 547 548 super(); 549 m_invisibleTags = new TreeSet<String>(); 550 m_visibleTags = new TreeSet<String>(); 551 } 552 553 /** 554 * Add a tag that will be visible for {@link NodeVisitor} instances. 555 * <p> 556 * 557 * Not only "this" tag will be visible but all parsed Tags that have the same name (case 558 * insensitive). 559 * <p> 560 * 561 * The given tag will be kept as-is. The following behaviour happens if this method is used: 562 * <ol> 563 * <li> 564 * Once <code>{@link #addTagPreserve(Tag)}</code> has been called all Tags that are not added 565 * to this method will be removed. <strong>We are in include mode then</strong>. 566 * </li> 567 * <li> 568 * The Tags provided to <code>{@link #addTagRemoval(Tag)}</code> will only have the 569 * power to hide exactly the same tags that are given to <code>{@link #addTagPreserve(Tag)}</code>: 570 * <strong>Deny is stronger than allow.</strong> 571 * </li> 572 * </ol> 573 * <p> 574 * 575 * 576 * @param tag the tag that will be visible for all {@link NodeVisitor} instances. 577 * 578 * @return true if the tag was added to the internal set of tags to keep, false if not (was 579 * contained before, has no name,...). 580 */ 581 public boolean addTagPreserve(final Tag tag) { 582 583 boolean result = false; 584 String tagName = tag.getTagName(); 585 if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(tagName)) { 586 result = m_visibleTags.add(tagName.toLowerCase()); 587 } 588 return result; 589 590 } 591 592 /** 593 * Add a tag that will be invisible for {@link NodeVisitor} instances. 594 * <p> 595 * 596 * Not only "this" tag will be invisible but all parsed Tags that have the same name (case 597 * insensitive). 598 * <p> 599 * 600 * @param tag the tag that will be visible for all {@link NodeVisitor} instances. 601 * 602 * @return true if the tag was added to the internal set of tags to remove, false if not (was 603 * contained before, has no name,...). 604 */ 605 public boolean addTagRemoval(final Tag tag) { 606 607 boolean result = false; 608 String tagName = tag.getTagName(); 609 if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(tagName)) { 610 result = m_invisibleTags.add(tagName.toLowerCase()); 611 } 612 return result; 613 } 614 615 /** 616 * @see org.htmlparser.PrototypicalNodeFactory#createTagNode(org.htmlparser.lexer.Page, int, 617 * int, java.util.Vector) 618 */ 619 @Override 620 public Tag createTagNode(Page arg0, int arg1, int arg2, Vector arg3) { 621 622 try { 623 String tagName = ((Attribute)arg3.get(0)).getName().toLowerCase(); 624 // end tags have names like "/a".... 625 if (tagName.charAt(0) == '/') { 626 tagName = tagName.substring(1); 627 } 628 Tag result = super.createTagNode(arg0, arg1, arg2, arg3); 629 if (!keepTag(tagName)) { 630 result = new CmsInvisibleTag(result); 631 } 632 return result; 633 } catch (RuntimeException rte) { 634 if (LOG.isErrorEnabled()) { 635 // log here, as htmlparser 1.5 did swallow exceptions from here and threw NPEs from 636 // other places 637 LOG.error(rte); 638 } 639 throw rte; 640 } 641 } 642 643 /** 644 * Encapsulation of the "preserve / remove" logic.<p> 645 * 646 * @param tagName the lower case name of the tag to keep or hide 647 * 648 * @return if true the given Tag will be kept, if false it will be removed 649 */ 650 private boolean keepTag(final String tagName) { 651 652 boolean result = false; 653 // include mode: 654 if (m_visibleTags.size() > 0) { 655 if (m_visibleTags.contains(tagName)) { 656 result = true; 657 } else { 658 result = false; 659 } 660 } 661 // Power of hide: if no visible tags configured this works as a normal remove, 662 // if visible tags are configured this can change a visible tag to be invisible 663 if (m_invisibleTags.contains(tagName)) { 664 result = false; 665 } 666 667 return result; 668 } 669}