/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search.extractors;

import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * The result of a document text extraction.<p>
 *
 * This data structure contains the extracted text as well as (optional)
 * meta information extracted from the document.<p>
 *
 * @since 6.0.0
 */
public interface I_CmsExtractionResult {

    /** Key to access the document author name in the item map. */
    String ITEM_AUTHOR = "author";

    /** Key to access the document category in the item map. */
    String ITEM_CATEGORY = "category";

    /** Key to access the document comments in the item map. */
    String ITEM_COMMENTS = "comments";

    /** Key to access the document company name in the item map. */
    String ITEM_COMPANY = "company";

    /** Key for accessing the default (combined) content in {@link #getContentItems()}. */
    String ITEM_CONTENT = "__content";

    /** Key to access the document creator name in the item map. */
    String ITEM_CREATOR = "creator";

    /** Key to access the document keywords in the item map. */
    String ITEM_KEYWORDS = "keywords";

    /** Key to access the document manager name in the item map. */
    String ITEM_MANAGER = "manager";

    /** Key to access the document producer name in the item map. */
    String ITEM_PRODUCER = "producer";

    /** Key for accessing the raw content in {@link #getContentItems()}. */
    String ITEM_RAW = "__raw";

    /** Key to access the document subject in the item map. */
    String ITEM_SUBJECT = "subject";

    /** Key to access the document title in the item map. */
    String ITEM_TITLE = "title";

    /**
     * All items that should be merged.<p>
     *
     * Java arrays are always mutable, so this array must be treated as read-only.
     */
    String[] ITEMS_TO_MERGE = {ITEM_CONTENT};

    /**
     * Returns this extraction result serialized as a byte array.<p>
     *
     * @return this extraction result serialized as a byte array
     */
    byte[] getBytes();

    /**
     * Returns the extracted content of the best fitting locale combined as a String.<p>
     *
     * @return the extracted content of the best fitting locale combined as a String
     */
    String getContent();

    /**
     * Returns the extracted content for the given locale combined as a String.<p>
     *
     * @param locale the locale of the extracted content
     *
     * @return the extracted content for the given locale combined as a String
     */
    String getContent(Locale locale);

    /**
     * Returns the extracted content for the best fitting locale as individual items.<p>
     *
     * The result Map contains all content items extracted
     * by the extractor. The key is always a String, and contains the name of the item.
     * The value is also a String and contains the extracted text.<p>
     *
     * The detailed form will depend on the resource type indexed:
     * <ul>
     * <li>For a <code>xmlpage</code>, the key will be the element name, and the value
     *      will be the text of the element.</li>
     * <li>For a <code>xmlcontent</code>, the key will be the xpath of the XML node,
     *      and the value will be the text of that XML node.</li>
     * <li>In case the document contains meta information (for example PDF or MS Office documents),
     *      the meta information is stored with the name of the meta field as key and the content as value.</li>
     * <li>For all other resource types, there will be only one key {@link #ITEM_CONTENT},
     *      which will contain the value of the complete content.</li>
     * </ul>
     *
     * The map has to be ordered to e.g., get the correct indexing order for search field mappings
     * when a sequence of values is mapped to a multi-valued search field.<p>
     *
     * @return the extracted content as individual items
     */
    LinkedHashMap<String, String> getContentItems();

    /**
     * Returns the extracted content for a given locale as individual items.<p>
     *
     * @param locale the locale of the extracted content items
     *
     * @return the extracted content for a given locale as individual items
     *
     * @see #getContentItems()
     */
    LinkedHashMap<String, String> getContentItems(Locale locale);

    /**
     * Returns the best fitting locale for the content.<p>
     *
     * @return the best fitting locale for the content
     */
    Locale getDefaultLocale();

    /**
     * Returns a map from search fields to values that should be stored in those fields.<p>
     *
     * @return a map from search fields to values that should be stored in those fields
     */
    Map<String, String> getFieldMappings();

    /**
     * Returns the locales in which the content is available.<p>
     *
     * @return the locales in which the content is available
     */
    Collection<Locale> getLocales();

    /**
     * Appends, for the locales of this extraction result, the content fields
     * from all provided extraction results to this extraction result.<p>
     *
     * @param extractionResults the extraction results to merge
     *
     * @return the merged result
     */
    I_CmsExtractionResult merge(List<I_CmsExtractionResult> extractionResults);

    /**
     * Releases the information stored in this extraction result, to free up the memory used.<p>
     */
    void release();
}