/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.opencms.search.extractors;

import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * The result of a document text extraction.<p>
 *
 * This data structure contains the extracted text as well as (optional)
 * meta information extracted from the document.<p>
 *
 * The <code>ITEM_*</code> constants declared here are the keys used to look up
 * individual values in the item map returned by {@link #getContentItems()}.<p>
 *
 * @since 6.0.0
 */
public interface I_CmsExtractionResult {

    /** Key to access the document author name in the item map. */
    String ITEM_AUTHOR = "author";

    /** Key to access the document category in the item map. */
    String ITEM_CATEGORY = "category";

    /** Key to access the document comments in the item map. */
    String ITEM_COMMENTS = "comments";

    /** Key to access the document company name in the item map. */
    String ITEM_COMPANY = "company";

    /** Key for accessing the default (combined) content in {@link #getContentItems()}. */
    String ITEM_CONTENT = "__content";

    /** Key to access the document creator name in the item map. */
    String ITEM_CREATOR = "creator";

    /** Key to access the document keywords in the item map. */
    String ITEM_KEYWORDS = "keywords";

    /** Key to access the document manager name in the item map. */
    String ITEM_MANAGER = "manager";

    /** Key to access the document producer name in the item map. */
    String ITEM_PRODUCER = "producer";

    /** Key for accessing the raw content in {@link #getContentItems()}. */
    String ITEM_RAW = "__raw";

    /** Key to access the document subject in the item map. */
    String ITEM_SUBJECT = "subject";

    /** Key to access the document title in the item map. */
    String ITEM_TITLE = "title";

    /**
     * Keys of all items that should be merged; currently only {@link #ITEM_CONTENT}.<p>
     *
     * Note that this is a plain array whose elements can be reassigned by any caller,
     * so it has to be treated as read-only.<p>
     */
    String[] ITEMS_TO_MERGE = {ITEM_CONTENT};

    /**
     * Returns this extraction result serialized as a byte array.<p>
     *
     * @return this extraction result serialized as a byte array
     */
    byte[] getBytes();

    /**
     * Returns the extracted content of the best fitting locale combined as a String.<p>
     *
     * @return the extracted content of the best fitting locale combined as a String
     */
    String getContent();

    /**
     * Returns the extracted content for the given locale combined as a String.<p>
     *
     * @param locale the locale of the extracted content
     *
     * @return the extracted content for the given locale combined as a String
     */
    String getContent(Locale locale);

    /**
     * Returns the extracted content for the best fitting locale as individual items.<p>
     *
     * The result Map contains all content items extracted
     * by the extractor. The key is always a String, and contains the name of the item.
     * The value is also a String and contains the extracted text.<p>
     *
     * The detailed form will depend on the resource type indexed:
     * <ul>
     * <li>For a <code>xmlpage</code>, the key will be the element name, and the value
     * will be the text of the element.</li>
     * <li>For a <code>xmlcontent</code>, the key will be the xpath of the XML node,
     * and the value will be the text of that XML node.</li>
     * <li>In case the document contains meta information (for example PDF or MS Office documents),
     * the meta information is stored with the name of the meta field as key and the content as value.</li>
     * <li>For all other resource types, there will be only one key {@link #ITEM_CONTENT},
     * which will contain the value of the complete content.</li>
     * </ul>
     *
     * The map has to be ordered to e.g. get the correct indexing order for search field mappings
     * when a sequence of values is mapped to a multi-valued search field.<p>
     *
     * @return the extracted content as individual items
     */
    LinkedHashMap<String, String> getContentItems();

    /**
     * Returns the extracted content for a given locale as individual items.<p>
     *
     * @param locale the locale of the extracted content items
     *
     * @return the extracted content for a given locale as individual items
     *
     * @see #getContentItems()
     */
    LinkedHashMap<String, String> getContentItems(Locale locale);

    /**
     * Returns the best fitting locale for the content.<p>
     *
     * @return the best fitting locale for the content
     */
    Locale getDefaultLocale();

    /**
     * Returns a map from search fields to the values that should be stored in those fields.<p>
     *
     * @return a map from search fields to the values that should be stored in those fields
     */
    Map<String, String> getFieldMappings();

    /**
     * Returns the locales in which the content is available.<p>
     *
     * @return the locales in which the content is available
     */
    Collection<Locale> getLocales();

    /**
     * Appends, for the locales of the current collection result, the content fields
     * from all provided extraction results to the current extraction result.<p>
     *
     * @param extractionResults the extraction results to merge
     *
     * @return the merged result
     */
    I_CmsExtractionResult merge(List<I_CmsExtractionResult> extractionResults);

    /**
     * Releases the information stored in this extraction result, to free up the memory used.<p>
     */
    void release();
}