001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.extractors;
029
030import org.opencms.util.CmsStringUtil;
031
032import java.io.ByteArrayInputStream;
033import java.io.ByteArrayOutputStream;
034import java.io.ObjectInputStream;
035import java.io.ObjectOutputStream;
036import java.io.Serializable;
037import java.util.Arrays;
038import java.util.Collection;
039import java.util.HashMap;
040import java.util.HashSet;
041import java.util.LinkedHashMap;
042import java.util.List;
043import java.util.Locale;
044import java.util.Map;
045
046/**
047 * The result of a document text extraction.<p>
048 *
049 * This data structure contains the extracted text as well as (optional)
050 * meta information extracted from the document.<p>
051 *
052 * @since 6.0.0
053 */
054public class CmsExtractionResult implements I_CmsExtractionResult, Serializable {
055
056    /** UID required for safe serialization. */
057    private static final long serialVersionUID = 1465447302192195154L;
058
059    /** The extracted individual content items. */
060    private Map<Locale, LinkedHashMap<String, String>> m_contentItems;
061
062    /** The locales of the content. */
063    private Collection<Locale> m_locales;
064
065    /** The default locale of the content. Can be <code>null</code> for unilingual extraction results. */
066    private Locale m_defaultLocale;
067
068    /** The extracted values directly added to the index. */
069    private Map<String, String> m_fieldMappings;
070
071    /** The serialized version of this object. */
072    private byte[] m_serializedVersion;
073
074    /** Creates a new multilingual extraction result.
075     * @param defaultLocale the default (best fitting) locale of the result.
076     * @param multilingualContentItems the content items for the different locales
077     * @param fieldMappings special mappings to search fields with values extracted from the content
078     */
079    public CmsExtractionResult(
080        Locale defaultLocale,
081        Map<Locale, LinkedHashMap<String, String>> multilingualContentItems,
082        Map<String, String> fieldMappings) {
083
084        m_defaultLocale = defaultLocale;
085        m_contentItems = null != multilingualContentItems
086        ? removeNullEntries(multilingualContentItems)
087        : new HashMap<Locale, LinkedHashMap<String, String>>(1);
088
089        // set the locales
090        m_locales = new HashSet<Locale>();
091        for (Locale locale : m_contentItems.keySet()) {
092            if (null != locale) {
093                m_locales.add(locale);
094            }
095        }
096
097        // ensure that a version for the default locale is present just to prevent null-checks
098        if (null == m_contentItems.get(m_defaultLocale)) {
099            m_contentItems.put(m_defaultLocale, new LinkedHashMap<String, String>());
100        }
101        m_fieldMappings = null != fieldMappings ? fieldMappings : new HashMap<String, String>();
102
103    }
104
105    /**
106     * Creates a new extraction result without meta information and without additional fields.<p>
107     *
108     * @param content the extracted content
109     */
110    public CmsExtractionResult(String content) {
111
112        this(content, null, null);
113        m_contentItems.get(m_defaultLocale).put(ITEM_RAW, content);
114    }
115
116    /**
117     * Creates a new unilingual extraction result.<p>
118     *
119     * @param content the extracted content
120     * @param contentItems the individual extracted content items
121     */
122    public CmsExtractionResult(String content, LinkedHashMap<String, String> contentItems) {
123
124        this(content, contentItems, null);
125    }
126
127    /**
128     * Creates a new unilingual extraction result.<p>
129     *
130     * @param content the extracted content
131     * @param contentItems the individual extracted content items
132     * @param fieldMappings extraction results that should directly be indexed
133     */
134    public CmsExtractionResult(
135        String content,
136        LinkedHashMap<String, String> contentItems,
137        Map<String, String> fieldMappings) {
138
139        m_defaultLocale = null;
140        m_locales = new HashSet<Locale>();
141        m_contentItems = new LinkedHashMap<Locale, LinkedHashMap<String, String>>(1);
142        if (fieldMappings != null) {
143            m_fieldMappings = fieldMappings;
144        } else {
145            m_fieldMappings = new HashMap<String, String>();
146        }
147        if (contentItems != null) {
148            m_contentItems.put(m_defaultLocale, contentItems);
149        } else {
150            m_contentItems.put(m_defaultLocale, new LinkedHashMap<String, String>());
151        }
152        if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(content)) {
153            m_contentItems.get(m_defaultLocale).put(ITEM_CONTENT, content);
154        }
155    }
156
157    /**
158     * Creates an extraction result from a serialized byte array.<p>
159     *
160     * @param bytes the serialized version of the extraction result
161     *
162     * @return extraction result created from the serialized byte array
163     */
164    public static final CmsExtractionResult fromBytes(byte[] bytes) {
165
166        Object obj = null;
167        if (bytes != null) {
168            // create an object out of the byte array
169            try {
170                ByteArrayInputStream in = new ByteArrayInputStream(bytes);
171                ObjectInputStream oin = new ObjectInputStream(in);
172                obj = oin.readObject();
173                oin.close();
174            } catch (Exception e) {
175                // ignore, null is not an instance of CmsExtractionResult
176            }
177            if (obj instanceof CmsExtractionResult) {
178                CmsExtractionResult result = (CmsExtractionResult)obj;
179                result.m_serializedVersion = bytes;
180                return result;
181            }
182        }
183        return null;
184    }
185
186    /**
187     * @see org.opencms.search.extractors.I_CmsExtractionResult#getBytes()
188     */
189    public byte[] getBytes() {
190
191        // check if we have a cached version of the serialized object available
192        if (m_serializedVersion != null) {
193            return m_serializedVersion;
194        }
195        try {
196            // serialize this object and return
197            ByteArrayOutputStream out = new ByteArrayOutputStream(512);
198            ObjectOutputStream oout = new ObjectOutputStream(out);
199            oout.writeObject(this);
200            oout.close();
201            m_serializedVersion = out.toByteArray();
202        } catch (Exception e) {
203            // ignore, serialized version will be null
204        }
205        return m_serializedVersion;
206    }
207
208    /**
209     * @see org.opencms.search.extractors.I_CmsExtractionResult#getContent()
210     */
211    public String getContent() {
212
213        return m_contentItems.get(m_defaultLocale).get(ITEM_CONTENT);
214    }
215
216    /**
217     * @see org.opencms.search.extractors.I_CmsExtractionResult#getContent(java.util.Locale)
218     */
219    public String getContent(Locale locale) {
220
221        Map<String, String> localeItems = m_contentItems.get(locale);
222        return null == localeItems ? null : localeItems.get(ITEM_CONTENT);
223    }
224
225    /**
226     * @see org.opencms.search.extractors.I_CmsExtractionResult#getContentItems()
227     */
228    public LinkedHashMap<String, String> getContentItems() {
229
230        return m_contentItems.get(m_defaultLocale);
231    }
232
233    /**
234     * @see org.opencms.search.extractors.I_CmsExtractionResult#getContentItems(java.util.Locale)
235     */
236    public LinkedHashMap<String, String> getContentItems(Locale locale) {
237
238        LinkedHashMap<String, String> localeItems = m_contentItems.get(locale);
239        return null == localeItems ? new LinkedHashMap<String, String>() : localeItems;
240    }
241
242    /**
243     * @see org.opencms.search.extractors.I_CmsExtractionResult#getDefaultLocale()
244     */
245    public Locale getDefaultLocale() {
246
247        return m_defaultLocale;
248    }
249
250    /**
251     * @see org.opencms.search.extractors.I_CmsExtractionResult#getFieldMappings()
252     */
253    public Map<String, String> getFieldMappings() {
254
255        return m_fieldMappings;
256    }
257
258    /**
259     * @see org.opencms.search.extractors.I_CmsExtractionResult#getLocales()
260     */
261    public Collection<Locale> getLocales() {
262
263        return m_locales;
264    }
265
266    /**
267     * @see org.opencms.search.extractors.I_CmsExtractionResult#merge(java.util.List)
268     */
269    public I_CmsExtractionResult merge(List<I_CmsExtractionResult> extractionResults) {
270
271        //prepare copy
272        Map<Locale, LinkedHashMap<String, String>> contentItems = new HashMap<Locale, LinkedHashMap<String, String>>(
273            m_locales.size());
274        for (Locale locale : m_locales) {
275            LinkedHashMap<String, String> originalLocalValues = m_contentItems.get(locale);
276            LinkedHashMap<String, String> localeValues = new LinkedHashMap<String, String>(originalLocalValues);
277            contentItems.put(locale, localeValues);
278        }
279
280        HashMap<String, String> fieldMappings = new HashMap<String, String>(m_fieldMappings.size());
281        for (String fieldMapping : m_fieldMappings.keySet()) {
282            fieldMappings.put(fieldMapping, m_fieldMappings.get(fieldMapping));
283        }
284
285        //merge content from the other extraction results
286        for (Locale locale : m_locales) {
287            Map<String, String> localeValues = contentItems.get(locale);
288            for (I_CmsExtractionResult result : extractionResults) {
289                if (result.getLocales().contains(locale) || result.getLocales().isEmpty()) {
290                    Map<String, String> resultLocaleValues = result.getLocales().isEmpty()
291                    ? result.getContentItems()
292                    : result.getContentItems(locale);
293                    for (String item : Arrays.asList(ITEMS_TO_MERGE)) {
294                        localeValues = mergeItem(item, localeValues, resultLocaleValues);
295                    }
296                }
297            }
298        }
299        return new CmsExtractionResult(m_defaultLocale, contentItems, fieldMappings);
300    }
301
302    /**
303     * @see org.opencms.search.extractors.I_CmsExtractionResult#release()
304     */
305    public void release() {
306
307        if (!m_contentItems.isEmpty()) {
308            m_contentItems.clear();
309        }
310        m_contentItems = null;
311        m_serializedVersion = null;
312    }
313
314    /** Merges the item from the resultLocaleValues into the corresponding item of the localeValues.
315     * @param item the item to merge
316     * @param localeValues the values where the item gets merged into
317     * @param resultLocaleValues the values where the item to merge is read from
318     * @return the modified localeValues with the merged item
319     */
320    private Map<String, String> mergeItem(
321        String item,
322        Map<String, String> localeValues,
323        Map<String, String> resultLocaleValues) {
324
325        if (resultLocaleValues.get(item) != null) {
326            if (localeValues.get(item) != null) {
327                localeValues.put(item, localeValues.get(item) + " " + resultLocaleValues.get(item));
328            } else {
329                localeValues.put(item, resultLocaleValues.get(item));
330            }
331        }
332
333        return localeValues;
334    }
335
336    /** Replaces all <code>null</code> values with empty maps.
337     * @param multilingualContentItems the map where replacement should take place
338     * @return the map with all <code>null</code> values replaced with empty maps.
339     */
340    private Map<Locale, LinkedHashMap<String, String>> removeNullEntries(
341        Map<Locale, LinkedHashMap<String, String>> multilingualContentItems) {
342
343        for (Locale locale : multilingualContentItems.keySet()) {
344            if (null == multilingualContentItems.get(locale)) {
345                multilingualContentItems.put(locale, new LinkedHashMap<String, String>());
346            }
347        }
348        return multilingualContentItems;
349    }
350}