001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.documents;
029
030import org.opencms.file.CmsObject;
031import org.opencms.file.CmsResource;
032import org.opencms.main.CmsException;
033import org.opencms.search.I_CmsSearchDocument;
034import org.opencms.search.I_CmsSearchIndex;
035
036import java.util.List;
037
038/**
039 * Used to create index Lucene Documents for OpenCms resources,
040 * controls the text extraction algorithm used for a specific OpenCms resource type / MIME type combination.<p>
041 *
042 * The configuration of the search index is defined in <code>opencms-search.xml</code>.
043 * There you can associate a combintion of OpenCms resource types and MIME types to an instance
044 * of this factory. This rather complex configuration is required because only the combination of
045 * OpenCms resource type and MIME type can decide what to use for search indexing.
046 * For example, if the OpenCms resource type is <code>plain</code>,
047 * the extraction algorithm for MIME types <code>.html</code> and <code>.txt</code> must be different.
048 * On the other hand, the MIME type <code>.html</code> in OpenCms can be almost any resource type,
049 * like <code>xmlpage</code>, <code>xmlcontent</code> or even <code>jsp</code>.<p>
050 *
051 * @since 6.0.0
052 */
053public interface I_CmsDocumentFactory extends I_CmsSearchExtractor {
054
055    /**
056     * Creates the Lucene Document for the given VFS resource and the given search index.<p>
057     *
058     * This triggers the indexing process for the given VFS resource according to the configuration
059     * of the provided index.<p>
060     *
061     * The provided index resource contains the basic contents to index.
062     * The provided search index contains the configuration what to index, such as the locale and
063     * possible special field mappings.<p>
064     *
065     * @param cms the OpenCms user context used to access the OpenCms VFS
066     * @param resource the search index resource to create the Lucene document from
067     * @param index the search index to create the Document for
068     *
069     * @return the Search Document for the given index resource and the given search index
070     *
071     * @throws CmsException if something goes wrong
072     *
073     * @see org.opencms.search.fields.CmsSearchFieldConfiguration#createDocument(CmsObject, CmsResource, I_CmsSearchIndex, org.opencms.search.extractors.I_CmsExtractionResult)
074     */
075    I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
076    throws CmsException;
077
078    /**
079     * Returns the disk based cache used to store the raw extraction results.<p>
080     *
081     * In case <code>null</code> is returned, then result caching is not supported for this factory.<p>
082     *
083     * @return the disk based cache used to store the raw extraction results
084     */
085    CmsExtractionResultCache getCache();
086
087    /**
088     * Returns the list of accepted keys for the resource types that can be indexed using this document factory.<p>
089     *
090     * The result List contains String objects.
091     * This String is later matched against {@link A_CmsVfsDocument#getDocumentKey(String, String)} to find
092     * the corrospondig {@link I_CmsDocumentFactory} for a resource to index.<p>
093     *
094     * The list of accepted resource types may contain a catch-all entry "*";
095     * in this case, a list for all possible resource types is returned,
096     * calculated by a logic depending on the document handler class.<p>
097     *
098     * @param resourceTypes list of accepted resource types
099     * @param mimeTypes list of accepted mime types
100     *
101     * @return the list of accepted keys for the resource types that can be indexed using this document factory (String objects)
102     *
103     * @throws CmsException if something goes wrong
104     */
105    List<String> getDocumentKeys(List<String> resourceTypes, List<String> mimeTypes) throws CmsException;
106
107    /**
108     * Returns the name of this document type factory.<p>
109     *
110     * @return the name of this document type factory
111     */
112    String getName();
113
114    /**
115     * Returns <code>true</code> if this document factory is locale depended.<p>
116     *
117     * @return <code>true</code> if this document factory is locale depended
118     */
119    boolean isLocaleDependend();
120
121    /**
122     * Returns <code>true</code> if result caching is supported for this factory.<p>
123     *
124     * @return <code>true</code> if result caching is supported for this factory
125     */
126    boolean isUsingCache();
127
128    /**
129     * Sets the disk based cache used to store the raw extraction results.<p>
130     *
131     * This should only be used for factories where {@link #isUsingCache()} returns <code>true</code>.<p>
132     *
133     * @param cache the disk based cache used to store the raw extraction results
134     */
135    void setCache(CmsExtractionResultCache cache);
136}