001/* 002 * This library is part of OpenCms - 003 * the Open Source Content Management System 004 * 005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com) 006 * 007 * This library is free software; you can redistribute it and/or 008 * modify it under the terms of the GNU Lesser General Public 009 * License as published by the Free Software Foundation; either 010 * version 2.1 of the License, or (at your option) any later version. 011 * 012 * This library is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * Lesser General Public License for more details. 016 * 017 * For further information about Alkacon Software GmbH & Co. KG, please see the 018 * company website: http://www.alkacon.com 019 * 020 * For further information about OpenCms, please see the 021 * project website: http://www.opencms.org 022 * 023 * You should have received a copy of the GNU Lesser General Public 024 * License along with this library; if not, write to the Free Software 025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 026 */ 027 028package org.opencms.xml.types; 029 030import org.opencms.file.CmsObject; 031import org.opencms.i18n.CmsEncoder; 032import org.opencms.main.CmsLog; 033import org.opencms.main.CmsRuntimeException; 034import org.opencms.relations.CmsLink; 035import org.opencms.relations.CmsLinkUpdateUtil; 036import org.opencms.staticexport.CmsLinkProcessor; 037import org.opencms.staticexport.CmsLinkTable; 038import org.opencms.util.CmsHtmlConverter; 039import org.opencms.util.CmsHtmlExtractor; 040import org.opencms.util.CmsStringUtil; 041import org.opencms.xml.CmsXmlGenericWrapper; 042import org.opencms.xml.I_CmsXmlDocument; 043import org.opencms.xml.page.CmsXmlPage; 044import org.opencms.xml.xml2json.I_CmsJsonFormattableValue; 045 046import java.util.Iterator; 047import java.util.Locale; 048 049import org.apache.commons.logging.Log; 050 051import org.dom4j.Attribute; 052import org.dom4j.Element; 053import org.htmlparser.util.ParserException; 054 055/** 056 * Describes the XML content type "OpenCmsHtml".<p> 057 * 058 * @since 6.0.0 059 */ 060public class CmsXmlHtmlValue extends A_CmsXmlContentValue implements I_CmsJsonFormattableValue { 061 062 /** The name of this type as used in the XML schema. */ 063 public static final String TYPE_NAME = "OpenCmsHtml"; 064 065 /** The log object for this class. */ 066 private static final Log LOG = CmsLog.getLog(CmsXmlHtmlValue.class); 067 068 /** The schema definition String is located in a text for easier editing. */ 069 private static String m_schemaDefinition; 070 071 /** Null value for plain text extraction errors. */ 072 private static final String NULL_VALUE = "null"; 073 074 /** Base type for single type instances, required for XML pages. */ 075 private static final I_CmsXmlSchemaType TYPE_BASE = new CmsXmlHtmlValue("base", "1", "1"); 076 077 /** The plain text value of the element node. */ 078 private String m_plainTextValue; 079 080 /** The String value of the element node. */ 081 private String m_stringValue; 082 083 /** 084 * Creates a new, empty schema type descriptor of type "OpenCmsHtml".<p> 085 */ 086 public CmsXmlHtmlValue() { 087 088 // empty constructor is required for class registration 089 } 090 091 /** 092 * Creates a new XML content value of type "OpenCmsHtml".<p> 093 * 094 * @param document the XML content instance this value belongs to 095 * @param element the XML element that contains this value 096 * @param locale the locale this value is created for 097 */ 098 public CmsXmlHtmlValue(I_CmsXmlDocument document, Element element, Locale locale) { 099 100 super(document, element, locale, TYPE_BASE); 101 } 102 103 /** 104 * Creates a new XML content value of type "OpenCmsHtml".<p> 105 * 106 * @param document the XML content instance this value belongs to 107 * @param element the XML element that contains this value 108 * @param locale the locale this value is created for 109 * @param type the type instance to create the value for 110 */ 111 public CmsXmlHtmlValue(I_CmsXmlDocument document, Element element, Locale locale, I_CmsXmlSchemaType type) { 112 113 super(document, element, locale, type); 114 } 115 116 /** 117 * Creates a new schema type descriptor for the type "OpenCmsHtml".<p> 118 * 119 * @param name the name of the XML node containing the value according to the XML schema 120 * @param minOccurs minimum number of occurrences of this type according to the XML schema 121 * @param maxOccurs maximum number of occurrences of this type according to the XML schema 122 */ 123 public CmsXmlHtmlValue(String name, String minOccurs, String maxOccurs) { 124 125 super(name, minOccurs, maxOccurs); 126 } 127 128 /** 129 * @see org.opencms.xml.types.A_CmsXmlContentValue#createValue(I_CmsXmlDocument, org.dom4j.Element, Locale) 130 */ 131 public I_CmsXmlContentValue createValue(I_CmsXmlDocument document, Element element, Locale locale) { 132 133 return new CmsXmlHtmlValue(document, element, locale, this); 134 } 135 136 /** 137 * @see org.opencms.xml.types.I_CmsXmlSchemaType#generateXml(org.opencms.file.CmsObject, org.opencms.xml.I_CmsXmlDocument, org.dom4j.Element, java.util.Locale) 138 */ 139 @Override 140 public Element generateXml(CmsObject cms, I_CmsXmlDocument document, Element root, Locale locale) { 141 142 Element element = root.addElement(getName()); 143 int index = element.getParent().elements(element.getQName()).indexOf(element); 144 element.addAttribute(CmsXmlPage.ATTRIBUTE_NAME, getName() + index); 145 element.addElement(CmsXmlPage.NODE_LINKS); 146 element.addElement(CmsXmlPage.NODE_CONTENT); 147 148 // get the default value from the content handler 149 String defaultValue = document.getHandler().getDefault(cms, this, locale); 150 if (defaultValue != null) { 151 try { 152 I_CmsXmlContentValue value = createValue(document, element, locale); 153 value.setStringValue(cms, defaultValue); 154 } catch (CmsRuntimeException e) { 155 // should not happen if default value is correct 156 LOG.error( 157 Messages.get().getBundle().key(Messages.ERR_XMLCONTENT_INVALID_ELEM_DEFAULT_1, defaultValue), 158 e); 159 element.clearContent(); 160 } 161 } 162 return element; 163 } 164 165 /** 166 * Returns the link table of this XML page element.<p> 167 * 168 * @return the link table of this XML page element 169 */ 170 public CmsLinkTable getLinkTable() { 171 172 CmsLinkTable linkTable = new CmsLinkTable(); 173 Element links = m_element.element(CmsXmlPage.NODE_LINKS); 174 if (links != null) { 175 Iterator<Element> itLinks = CmsXmlGenericWrapper.elementIterator(links, CmsXmlPage.NODE_LINK); 176 while (itLinks.hasNext()) { 177 Element lelem = itLinks.next(); 178 linkTable.addLink(new CmsLink(lelem)); 179 } 180 } 181 return linkTable; 182 } 183 184 /** 185 * @see org.opencms.xml.types.I_CmsXmlContentValue#getPlainText(org.opencms.file.CmsObject) 186 */ 187 @Override 188 public String getPlainText(CmsObject cms) { 189 190 if (m_plainTextValue == null) { 191 try { 192 m_plainTextValue = CmsHtmlExtractor.extractText(getStringValue(cms), m_document.getEncoding()); 193 } catch (Exception exc) { 194 m_plainTextValue = NULL_VALUE; 195 } 196 } 197 if (m_plainTextValue == NULL_VALUE) { 198 return null; 199 } 200 return m_plainTextValue; 201 } 202 203 /** 204 * @see org.opencms.xml.types.I_CmsXmlSchemaType#getSchemaDefinition() 205 */ 206 public String getSchemaDefinition() { 207 208 // the schema definition is located in a separate file for easier editing 209 if (m_schemaDefinition == null) { 210 m_schemaDefinition = readSchemaDefinition("org/opencms/xml/types/XmlHtmlValue.xsd"); 211 } 212 return m_schemaDefinition; 213 } 214 215 /** 216 * @see org.opencms.xml.types.I_CmsXmlContentValue#getStringValue(org.opencms.file.CmsObject) 217 */ 218 public String getStringValue(CmsObject cms) { 219 220 if (m_stringValue == null) { 221 m_stringValue = createStringValue(cms, m_document); 222 } 223 224 return m_stringValue; 225 } 226 227 /** 228 * @see org.opencms.xml.types.A_CmsXmlContentValue#getTypeName() 229 */ 230 public String getTypeName() { 231 232 return TYPE_NAME; 233 } 234 235 /** 236 * @see org.opencms.xml.types.A_CmsXmlContentValue#newInstance(java.lang.String, java.lang.String, java.lang.String) 237 */ 238 public I_CmsXmlSchemaType newInstance(String name, String minOccurs, String maxOccurs) { 239 240 return new CmsXmlHtmlValue(name, minOccurs, maxOccurs); 241 } 242 243 /** 244 * @see org.opencms.xml.types.I_CmsXmlContentValue#setStringValue(org.opencms.file.CmsObject, java.lang.String) 245 */ 246 public void setStringValue(CmsObject cms, String value) { 247 248 Element content = m_element.element(CmsXmlPage.NODE_CONTENT); 249 Element links = m_element.element(CmsXmlPage.NODE_LINKS); 250 CmsLinkProcessor linkProcessor = null; 251 252 String encoding = m_document.getEncoding(); 253 linkProcessor = m_document.getLinkProcessor(cms, new CmsLinkTable()); 254 255 String finalValue = value; 256 if (finalValue != null) { 257 // nested CDATA tags are not allowed, so replace CDATA tags with their contents 258 finalValue = finalValue.replaceAll("(?s)// <!\\[CDATA\\[(.*?)// \\]\\]>", "$1"); // special case for embedded Javascript 259 finalValue = finalValue.replaceAll("(?s)<!\\[CDATA\\[(.*?)\\]\\]>", "$1"); 260 } 261 if (encoding != null) { 262 // ensure all chars in the given content are valid chars for the selected charset 263 finalValue = CmsEncoder.adjustHtmlEncoding(finalValue, encoding); 264 } 265 266 // remove unnecessary tags if required 267 String contentConversion = m_document.getConversion(); 268 if (CmsHtmlConverter.isConversionEnabled(contentConversion)) { 269 CmsHtmlConverter converter = new CmsHtmlConverter(encoding, contentConversion); 270 finalValue = converter.convertToStringSilent(finalValue); 271 finalValue = fixNullCharacters(finalValue); 272 } 273 if (linkProcessor != null) { 274 try { 275 // replace links in HTML by macros and fill link table 276 finalValue = linkProcessor.replaceLinks(finalValue); 277 } catch (Exception exc) { 278 throw new CmsRuntimeException(Messages.get().container(Messages.ERR_HTML_DATA_PROCESSING_0), exc); 279 } 280 } 281 282 content.clearContent(); 283 links.clearContent(); 284 285 if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(finalValue)) { 286 content.addCDATA(finalValue); 287 if (linkProcessor != null) { 288 // may be null in case of default value generation (i.e. setStringValue(String) was called) 289 290 CmsLinkTable linkTable = linkProcessor.getLinkTable(); 291 for (Iterator<CmsLink> i = linkTable.iterator(); i.hasNext();) { 292 CmsLink link = i.next(); 293 CmsLinkUpdateUtil.updateXmlForHtmlValue( 294 link, 295 link.getName(), 296 links.addElement(CmsXmlPage.NODE_LINK)); 297 } 298 } 299 } 300 301 // ensure the String value is re-calculated next time 302 m_stringValue = null; 303 } 304 305 /** 306 * @see org.opencms.xml.xml2json.I_CmsJsonFormattableValue#toJson(org.opencms.file.CmsObject) 307 */ 308 public Object toJson(CmsObject cms) { 309 310 return getStringValue(cms); 311 } 312 313 /** 314 * JTidy sometimes erroneouslsy produces HTML containing 'null' characters (Unicode code point 0), which are 315 * invalid in an XML document. Until we find a way to prevent JTidy doing that, we remove the null characters 316 * from the HTML, and log a warning.<p> 317 * 318 * @param jtidyOutput the JTidy output 319 * @return the output with null characters removed 320 */ 321 protected String fixNullCharacters(String jtidyOutput) { 322 323 String outputWithoutNullChars = jtidyOutput.replaceAll("\u0000", ""); 324 if (jtidyOutput.length() != outputWithoutNullChars.length()) { 325 String context = ""; 326 if (m_document.getFile() != null) { 327 context = "(file=" + m_document.getFile().getRootPath() + ")"; 328 } 329 LOG.warn("HTML cleanup produced invalid null characters in output. " + context); 330 LOG.debug("HTML cleanup output = " + jtidyOutput); 331 } 332 return outputWithoutNullChars; 333 } 334 335 /** 336 * Creates the String value for this HTML value element.<p> 337 * 338 * @param cms an initialized instance of a CmsObject 339 * @param document the XML document this value belongs to 340 * 341 * @return the String value for this HTML value element 342 */ 343 private String createStringValue(CmsObject cms, I_CmsXmlDocument document) { 344 345 Element data = m_element.element(CmsXmlPage.NODE_CONTENT); 346 if (data == null) { 347 String content = m_element.getText(); 348 m_element.clearContent(); 349 int index = m_element.getParent().elements(m_element.getQName()).indexOf(m_element); 350 m_element.addAttribute(CmsXmlPage.ATTRIBUTE_NAME, getName() + index); 351 m_element.addElement(CmsXmlPage.NODE_LINKS); 352 m_element.addElement(CmsXmlPage.NODE_CONTENT).addCDATA(content); 353 data = m_element.element(CmsXmlPage.NODE_CONTENT); 354 } 355 Attribute enabled = m_element.attribute(CmsXmlPage.ATTRIBUTE_ENABLED); 356 357 String content = ""; 358 if ((enabled == null) || Boolean.valueOf(enabled.getText()).booleanValue()) { 359 360 content = data.getText(); 361 362 CmsLinkTable linkTable = getLinkTable(); 363 if (!linkTable.isEmpty()) { 364 365 // link processing: replace macros with links 366 CmsLinkProcessor linkProcessor = document.getLinkProcessor(cms, linkTable); 367 try { 368 content = linkProcessor.processLinks(content); 369 } catch (ParserException e) { 370 // should better not happen 371 LOG.error(Messages.get().getBundle().key(Messages.ERR_XMLCONTENT_LINK_PROCESS_FAILED_0), e); 372 } 373 } 374 } 375 return content; 376 } 377}