001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.documents;
029
030import org.opencms.file.CmsFile;
031import org.opencms.file.CmsObject;
032import org.opencms.file.CmsResource;
033import org.opencms.main.CmsException;
034import org.opencms.main.CmsLog;
035import org.opencms.search.I_CmsSearchDocument;
036import org.opencms.search.I_CmsSearchIndex;
037import org.opencms.search.extractors.I_CmsExtractionResult;
038
039import java.util.ArrayList;
040import java.util.Iterator;
041import java.util.List;
042
043import org.apache.commons.logging.Log;
044
045/**
046 * Base document factory class for a VFS <code>{@link org.opencms.file.CmsResource}</code>,
047 * just requires a specialized implementation of
048 * <code>{@link I_CmsDocumentFactory#extractContent(CmsObject, CmsResource, I_CmsSearchIndex)}</code>
049 * for text extraction from the binary document content.<p>
050 *
051 * @since 6.0.0
052 */
053public abstract class A_CmsVfsDocument implements I_CmsDocumentFactory {
054
055    /**
056     * Generic type name used as default for all types that are globally unconfigured.
057     * Note that any special xml content is already configured if xmlcontent is configured.
058     */
059    public static final String DEFAULT_ALL_UNCONFIGURED_TYPES = "__unconfigured__";
060    /** Generic type name used as default for all types. */
061    public static final String DEFAULT_ALL_TYPES = "__all__";
062    /** The log object for this class. */
063    private static final Log LOG = CmsLog.getLog(A_CmsVfsDocument.class);
064
065    /** Name of the document type. */
066    protected String m_name;
067
068    /** The cache used for storing extracted documents. */
069    private CmsExtractionResultCache m_cache;
070
071    /**
072     * Creates a new instance of this lucene document factory.<p>
073     *
074     * @param name name of the documenttype
075     */
076    public A_CmsVfsDocument(String name) {
077
078        m_name = name;
079    }
080
081    /**
082     * Creates a document factory lookup key for the given resource type name / MIME type configuration.<p>
083     *
084     * If the given <code>mimeType</code> is <code>null</code>, this indicates that the key should
085     * match all VFS resource of the given resource type regardless of the MIME type.<p>
086     *
087     * @param type the resource type name to use
088     * @param mimeType the MIME type to use
089     *
090     * @return a document factory lookup key for the given resource id / MIME type configuration
091     */
092    public static String getDocumentKey(String type, String mimeType) {
093
094        StringBuffer result = new StringBuffer(16);
095        result.append(I_CmsSearchDocument.VFS_DOCUMENT_KEY_PREFIX);
096        result.append('_');
097        result.append(type);
098        if (mimeType != null) {
099            result.append(':');
100            result.append(mimeType);
101        }
102        return result.toString();
103    }
104
105    /**
106     * Generates a new lucene document instance from contents of the given resource for the provided index.<p>
107     *
108     * @see org.opencms.search.documents.I_CmsDocumentFactory#createDocument(CmsObject, CmsResource, I_CmsSearchIndex)
109     */
110    public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
111    throws CmsException {
112
113        // extract the content from the resource
114        I_CmsExtractionResult content = null;
115
116        if (index.isExtractingContent()) {
117            // do full text content extraction only if required
118
119            // check if caching is enabled for this document type
120            CmsExtractionResultCache cache = getCache();
121            String cacheName = null;
122            if ((cache != null) && (resource.getSiblingCount() > 1)) {
123                // hard drive based caching only makes sense for resources that have siblings,
124                // because the index will also store the content as a blob
125                cacheName = cache.getCacheName(
126                    resource,
127                    isLocaleDependend() ? index.getLocaleForResource(cms, resource, null) : null,
128                    getName());
129                content = cache.getCacheObject(cacheName);
130            }
131
132            if (content == null) {
133                // extraction result has not been found in the cache
134                // use the currently indexed content, if it is still up to date.
135                content = index.getContentIfUnchanged(resource);
136            }
137
138            if (content == null) {
139                // extraction result has not been attached to the resource
140                try {
141                    content = extractContent(cms, resource, index);
142                    if (LOG.isDebugEnabled()) {
143                        LOG.debug("Extracting content for '" + resource.getRootPath() + "' successful.");
144                    }
145                    if ((cache != null) && (resource.getSiblingCount() > 1)) {
146                        // save extracted content to the cache
147                        cache.saveCacheObject(cacheName, content);
148                    }
149                } catch (CmsIndexNoContentException e) {
150                    // there was no content found for the resource
151                    LOG.info(
152                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath())
153                            + " "
154                            + e.getMessage());
155                } catch (Throwable e) {
156                    // text extraction failed for document - continue indexing meta information only
157                    LOG.error(
158                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
159                        e);
160                }
161            }
162        }
163
164        // create the Lucene document according to the index field configuration
165        return index.getFieldConfiguration().createDocument(cms, resource, index, content);
166    }
167
168    /**
169     * @see org.opencms.search.documents.I_CmsDocumentFactory#getCache()
170     */
171    public CmsExtractionResultCache getCache() {
172
173        return m_cache;
174    }
175
176    /**
177     * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
178     */
179    public List<String> getDocumentKeys(List<String> resourceTypes, List<String> mimeTypes) throws CmsException {
180
181        List<String> keys = new ArrayList<String>();
182
183        try {
184            for (Iterator<String> i = resourceTypes.iterator(); i.hasNext();) {
185
186                String typeName = i.next();
187                if (typeName.equals("*")) {
188                    typeName = DEFAULT_ALL_UNCONFIGURED_TYPES;
189                }
190                if (typeName.equals("**")) {
191                    typeName = DEFAULT_ALL_TYPES;
192                }
193                for (Iterator<String> j = mimeTypes.iterator(); j.hasNext();) {
194                    keys.add(getDocumentKey(typeName, j.next()));
195                }
196                if (mimeTypes.isEmpty()) {
197                    keys.add(getDocumentKey(typeName, null));
198                }
199            }
200        } catch (Exception exc) {
201            throw new CmsException(Messages.get().container(Messages.ERR_CREATE_DOC_KEY_0), exc);
202        }
203
204        return keys;
205    }
206
207    /**
208     * @see org.opencms.search.documents.I_CmsDocumentFactory#getName()
209     */
210    public String getName() {
211
212        return m_name;
213    }
214
215    /**
216     * @see org.opencms.search.documents.I_CmsDocumentFactory#setCache(org.opencms.search.documents.CmsExtractionResultCache)
217     */
218    public void setCache(CmsExtractionResultCache cache) {
219
220        m_cache = cache;
221    }
222
223    /**
224     * Logs content extraction for the specified resource and index.<p>
225     *
226     * @param resource the resource to log content extraction for
227     * @param index the search index to log content extraction for
228     */
229    protected void logContentExtraction(CmsResource resource, I_CmsSearchIndex index) {
230
231        if (LOG.isDebugEnabled()) {
232            LOG.debug(
233                Messages.get().getBundle().key(
234                    Messages.LOG_EXTRACT_CONTENT_2,
235                    resource.getRootPath(),
236                    index.getName()));
237        }
238    }
239
240    /**
241     * Upgrades the given resource to a {@link CmsFile} with content.<p>
242     *
243     * @param cms the current users OpenCms context
244     * @param resource the resource to upgrade
245     *
246     * @return the given resource upgraded to a {@link CmsFile} with content
247     *
248     * @throws CmsException if the resource could not be read
249     * @throws CmsIndexNoContentException if the resource has no content
250     */
251    protected CmsFile readFile(CmsObject cms, CmsResource resource) throws CmsException, CmsIndexNoContentException {
252
253        CmsFile file = cms.readFile(resource);
254        if (file.getLength() <= 0) {
255            throw new CmsIndexNoContentException(
256                Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
257        }
258        return file;
259    }
260}