/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search.extractors;

import org.opencms.util.CmsStringUtil;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * The result of a document text extraction.<p>
 *
 * This data structure contains the extracted text as well as (optional)
 * meta information extracted from the document.<p>
 *
 * Content items are stored per locale. A unilingual result has no locales and keeps
 * its items under the <code>null</code> key, which is also its default locale.<p>
 *
 * @since 6.0.0
 */
public class CmsExtractionResult implements I_CmsExtractionResult, Serializable {

    /** UID required for safe serialization. */
    private static final long serialVersionUID = 1465447302192195154L;

    /** The extracted individual content items. */
    private Map<Locale, LinkedHashMap<String, String>> m_contentItems;

    /** The locales of the content. */
    private Collection<Locale> m_locales;

    /** The default locale of the content. Can be <code>null</code> for unilingual extraction results. */
    private Locale m_defaultLocale;

    /** The extracted values directly added to the index. */
    private Map<String, String> m_fieldMappings;

    /** The serialized version of this object. */
    private byte[] m_serializedVersion;

    /**
     * Creates a new multilingual extraction result.<p>
     *
     * The given content item map is used directly (it is not copied) and is modified:
     * <code>null</code> values are replaced by empty maps and an empty entry for the default
     * locale is added if none is present. The locales of the result are the
     * non-<code>null</code> keys the map had before that default entry was added.<p>
     *
     * @param defaultLocale the default (best fitting) locale of the result.
     * @param multilingualContentItems the content items for the different locales, can be <code>null</code>
     * @param fieldMappings special mappings to search fields with values extracted from the content,
     *      can be <code>null</code>
     */
    public CmsExtractionResult(
        Locale defaultLocale,
        Map<Locale, LinkedHashMap<String, String>> multilingualContentItems,
        Map<String, String> fieldMappings) {

        m_defaultLocale = defaultLocale;
        if (multilingualContentItems != null) {
            m_contentItems = removeNullEntries(multilingualContentItems);
        } else {
            m_contentItems = new HashMap<Locale, LinkedHashMap<String, String>>(1);
        }

        // set the locales
        m_locales = new HashSet<Locale>();
        for (Locale locale : m_contentItems.keySet()) {
            if (locale != null) {
                m_locales.add(locale);
            }
        }

        // ensure that a version for the default locale is present just to prevent null-checks
        if (m_contentItems.get(m_defaultLocale) == null) {
            m_contentItems.put(m_defaultLocale, new LinkedHashMap<String, String>());
        }

        if (fieldMappings != null) {
            m_fieldMappings = fieldMappings;
        } else {
            m_fieldMappings = new HashMap<String, String>();
        }
    }

    /**
     * Creates a new extraction result without meta information and without additional fields.<p>
     *
     * In addition to what the three argument constructor stores, the content is
     * always stored as {@link I_CmsExtractionResult#ITEM_RAW} item.<p>
     *
     * @param content the extracted content
     */
    public CmsExtractionResult(String content) {

        this(content, null, null);
        m_contentItems.get(m_defaultLocale).put(ITEM_RAW, content);
    }

    /**
     * Creates a new unilingual extraction result.<p>
     *
     * @param content the extracted content
     * @param contentItems the individual extracted content items
     */
    public CmsExtractionResult(String content, LinkedHashMap<String, String> contentItems) {

        this(content, contentItems, null);
    }

    /**
     * Creates a new unilingual extraction result.<p>
     *
     * The result has no locales and a <code>null</code> default locale. The given
     * maps are used directly (they are not copied).<p>
     *
     * @param content the extracted content, only stored as {@link I_CmsExtractionResult#ITEM_CONTENT}
     *      item if {@link CmsStringUtil#isNotEmptyOrWhitespaceOnly(String)} accepts it
     * @param contentItems the individual extracted content items, can be <code>null</code>
     * @param fieldMappings extraction results that should directly be indexed, can be <code>null</code>
     */
    public CmsExtractionResult(
        String content,
        LinkedHashMap<String, String> contentItems,
        Map<String, String> fieldMappings) {

        m_defaultLocale = null;
        m_locales = new HashSet<Locale>();
        m_contentItems = new LinkedHashMap<Locale, LinkedHashMap<String, String>>(1);
        m_fieldMappings = (fieldMappings != null) ? fieldMappings : new HashMap<String, String>();

        LinkedHashMap<String, String> items = (contentItems != null)
        ? contentItems
        : new LinkedHashMap<String, String>();
        m_contentItems.put(m_defaultLocale, items);

        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(content)) {
            items.put(ITEM_CONTENT, content);
        }
    }

    /**
     * Creates an extraction result from a serialized byte array.<p>
     *
     * The given array is kept as the cached serialized version of the returned object.<p>
     *
     * NOTE(review): this uses native Java deserialization, so the bytes must originate
     * from a trusted source.<p>
     *
     * @param bytes the serialized version of the extraction result
     *
     * @return extraction result created from the serialized byte array, or <code>null</code>
     *      if the array is <code>null</code> or does not contain a serialized extraction result
     */
    public static final CmsExtractionResult fromBytes(byte[] bytes) {

        if (bytes == null) {
            return null;
        }
        Object obj = null;
        // create an object out of the byte array
        try {
            ObjectInputStream oin = new ObjectInputStream(new ByteArrayInputStream(bytes));
            try {
                obj = oin.readObject();
            } finally {
                // close the stream even if reading the object failed
                oin.close();
            }
        } catch (Exception e) {
            // ignore, null is not an instance of CmsExtractionResult
        }
        if (obj instanceof CmsExtractionResult) {
            CmsExtractionResult result = (CmsExtractionResult)obj;
            result.m_serializedVersion = bytes;
            return result;
        }
        return null;
    }

    /**
     * Returns the serialized version of this object, which is cached after the first
     * successful serialization. Returns <code>null</code> if serialization fails.<p>
     *
     * @see org.opencms.search.extractors.I_CmsExtractionResult#getBytes()
     */
    public byte[] getBytes() {

        // check if we have a cached version of the serialized object available
        if (m_serializedVersion != null) {
            return m_serializedVersion;
        }
        try {
            // serialize this object and return
            ByteArrayOutputStream out = new ByteArrayOutputStream(512);
            ObjectOutputStream oout = new ObjectOutputStream(out);
            try {
                oout.writeObject(this);
            } finally {
                // close the stream even if writing the object failed
                oout.close();
            }
            m_serializedVersion = out.toByteArray();
        } catch (Exception e) {
            // ignore, serialized version will be null
        }
        return m_serializedVersion;
    }

    /**
     * @see org.opencms.search.extractors.I_CmsExtractionResult#getContent()
     */
    public String getContent() {

        return m_contentItems.get(m_defaultLocale).get(ITEM_CONTENT);
    }

    /**
     * @see org.opencms.search.extractors.I_CmsExtractionResult#getContent(java.util.Locale)
     */
    public String getContent(Locale locale) {

        Map<String, String> localeItems = m_contentItems.get(locale);
        if (localeItems == null) {
            return null;
        }
        return localeItems.get(ITEM_CONTENT);
    }

    /**
     * @see org.opencms.search.extractors.I_CmsExtractionResult#getContentItems()
     */
    public LinkedHashMap<String, String> getContentItems() {

        return m_contentItems.get(m_defaultLocale);
    }

    /**
     * Returns the content items stored for the given locale, or a new empty map
     * if there are none.<p>
     *
     * @see org.opencms.search.extractors.I_CmsExtractionResult#getContentItems(java.util.Locale)
     */
    public LinkedHashMap<String, String> getContentItems(Locale locale) {

        LinkedHashMap<String, String> localeItems = m_contentItems.get(locale);
        if (localeItems == null) {
            return new LinkedHashMap<String, String>();
        }
        return localeItems;
    }

    /**
     * @see org.opencms.search.extractors.I_CmsExtractionResult#getDefaultLocale()
     */
    public Locale getDefaultLocale() {

        return m_defaultLocale;
    }

    /**
     * @see org.opencms.search.extractors.I_CmsExtractionResult#getFieldMappings()
     */
    public Map<String, String> getFieldMappings() {

        return m_fieldMappings;
    }

    /**
     * @see org.opencms.search.extractors.I_CmsExtractionResult#getLocales()
     */
    public Collection<Locale> getLocales() {

        return m_locales;
    }

    /**
     * Returns a new extraction result with copies of this result's content items and field
     * mappings, where the items listed in {@link I_CmsExtractionResult#ITEMS_TO_MERGE} are
     * extended by the values from the given results. This object is not modified.<p>
     *
     * Only the locales of this result are considered. A given result without locales
     * contributes its default content items to every locale of this result.<p>
     *
     * NOTE(review): a unilingual receiver has no locales, so its content items (stored under
     * the <code>null</code> key) are not carried over into the merged result - confirm that
     * this method is only meant to be called on multilingual results.<p>
     *
     * @see org.opencms.search.extractors.I_CmsExtractionResult#merge(java.util.List)
     */
    public I_CmsExtractionResult merge(List<I_CmsExtractionResult> extractionResults) {

        //prepare copy
        Map<Locale, LinkedHashMap<String, String>> contentItems = new HashMap<Locale, LinkedHashMap<String, String>>(
            m_locales.size());
        for (Locale locale : m_locales) {
            contentItems.put(locale, new LinkedHashMap<String, String>(m_contentItems.get(locale)));
        }

        HashMap<String, String> fieldMappings = new HashMap<String, String>(m_fieldMappings.size());
        fieldMappings.putAll(m_fieldMappings);

        //merge content from the other extraction results
        for (Locale locale : m_locales) {
            Map<String, String> localeValues = contentItems.get(locale);
            for (I_CmsExtractionResult result : extractionResults) {
                boolean unilingual = result.getLocales().isEmpty();
                if (unilingual || result.getLocales().contains(locale)) {
                    Map<String, String> resultLocaleValues = unilingual
                    ? result.getContentItems()
                    : result.getContentItems(locale);
                    for (String item : Arrays.asList(ITEMS_TO_MERGE)) {
                        localeValues = mergeItem(item, localeValues, resultLocaleValues);
                    }
                }
            }
        }
        return new CmsExtractionResult(m_defaultLocale, contentItems, fieldMappings);
    }

    /**
     * Clears and drops the content items and the cached serialized version.<p>
     *
     * Calling this method more than once is safe. The methods that read content items
     * must not be used on this object afterwards, since the content item map is gone.<p>
     *
     * @see org.opencms.search.extractors.I_CmsExtractionResult#release()
     */
    public void release() {

        // the map is null if this result was already released before
        if (m_contentItems != null) {
            if (!m_contentItems.isEmpty()) {
                m_contentItems.clear();
            }
            m_contentItems = null;
        }
        m_serializedVersion = null;
    }

    /**
     * Merges the item from the resultLocaleValues into the corresponding item of the localeValues.<p>
     *
     * If both maps contain a value for the item, the values are joined with a single blank.
     * Nothing happens if the resultLocaleValues have no value for the item.<p>
     *
     * @param item the item to merge
     * @param localeValues the values where the item gets merged into
     * @param resultLocaleValues the values where the item to merge is read from
     * @return the modified localeValues with the merged item
     */
    private Map<String, String> mergeItem(
        String item,
        Map<String, String> localeValues,
        Map<String, String> resultLocaleValues) {

        String addition = resultLocaleValues.get(item);
        if (addition != null) {
            String existing = localeValues.get(item);
            localeValues.put(item, (existing != null) ? (existing + " " + addition) : addition);
        }
        return localeValues;
    }

    /**
     * Replaces all <code>null</code> values with empty maps.<p>
     *
     * The replacement happens in place, so the given map itself is modified and returned.<p>
     *
     * @param multilingualContentItems the map where replacement should take place
     * @return the map with all <code>null</code> values replaced with empty maps.
     */
    private Map<Locale, LinkedHashMap<String, String>> removeNullEntries(
        Map<Locale, LinkedHashMap<String, String>> multilingualContentItems) {

        // only values of existing keys are replaced, which is no structural modification of the map
        for (Map.Entry<Locale, LinkedHashMap<String, String>> entry : multilingualContentItems.entrySet()) {
            if (entry.getValue() == null) {
                entry.setValue(new LinkedHashMap<String, String>());
            }
        }
        return multilingualContentItems;
    }
}