001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.documents;
029
030import org.opencms.file.CmsFile;
031import org.opencms.file.CmsObject;
032import org.opencms.file.CmsResource;
033import org.opencms.main.CmsException;
034import org.opencms.main.CmsLog;
035import org.opencms.search.I_CmsSearchDocument;
036import org.opencms.search.I_CmsSearchIndex;
037import org.opencms.search.extractors.I_CmsExtractionResult;
038
039import java.util.ArrayList;
040import java.util.Iterator;
041import java.util.List;
042
043import org.apache.commons.logging.Log;
044
045/**
046 * Base document factory class for a VFS <code>{@link org.opencms.file.CmsResource}</code>,
047 * just requires a specialized implementation of
048 * <code>{@link I_CmsDocumentFactory#extractContent(CmsObject, CmsResource, I_CmsSearchIndex)}</code>
049 * for text extraction from the binary document content.<p>
050 *
051 * @since 6.0.0
052 */
053public abstract class A_CmsVfsDocument implements I_CmsDocumentFactory {
054
055    /**
056     * Generic type name used as default for all types that are globally unconfigured.
057     * Note that any special xml content is already configured if xmlcontent is configured.
058     */
059    public static final String DEFAULT_ALL_UNCONFIGURED_TYPES = "__unconfigured__";
060    /** Generic type name used as default for all types. */
061    public static final String DEFAULT_ALL_TYPES = "__all__";
062    /** The log object for this class. */
063    private static final Log LOG = CmsLog.getLog(A_CmsVfsDocument.class);
064
065    /** Name of the document type. */
066    protected String m_name;
067
068    /** The cache used for storing extracted documents. */
069    private CmsExtractionResultCache m_cache;
070
071    /**
072     * Creates a new instance of this lucene document factory.<p>
073     *
074     * @param name name of the documenttype
075     */
076    public A_CmsVfsDocument(String name) {
077
078        m_name = name;
079    }
080
081    /**
082     * Creates a document factory lookup key for the given resource type name / MIME type configuration.<p>
083     *
084     * If the given <code>mimeType</code> is <code>null</code>, this indicates that the key should
085     * match all VFS resource of the given resource type regardless of the MIME type.<p>
086     *
087     * @param type the resource type name to use
088     * @param mimeType the MIME type to use
089     *
090     * @return a document factory lookup key for the given resource id / MIME type configuration
091     */
092    public static String getDocumentKey(String type, String mimeType) {
093
094        StringBuffer result = new StringBuffer(16);
095        result.append(I_CmsSearchDocument.VFS_DOCUMENT_KEY_PREFIX);
096        result.append('_');
097        result.append(type);
098        if (mimeType != null) {
099            result.append(':');
100            result.append(mimeType);
101        }
102        return result.toString();
103    }
104
105    /**
106     * {@inheritDoc}
107     *
108     * @see org.opencms.search.documents.I_CmsDocumentFactory#createDocument(CmsObject, CmsResource, I_CmsSearchIndex)
109     */
110    public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
111    throws CmsException {
112
113        // extract the content from the resource
114        I_CmsExtractionResult content = null;
115
116        if (index.isExtractingContent()) {
117            // do full text content extraction only if required
118
119            // check if caching is enabled for this document type
120            CmsExtractionResultCache cache = getCache();
121            String cacheName = null;
122            if ((cache != null) && (resource.getSiblingCount() > 1)) {
123                // hard drive based caching only makes sense for resources that have siblings,
124                // because the index will also store the content as a blob
125                cacheName = cache.getCacheName(
126                    resource,
127                    isLocaleDependend() ? index.getLocaleForResource(cms, resource, null) : null,
128                    getName());
129                content = cache.getCacheObject(cacheName);
130                if ((content != null) && LOG.isDebugEnabled()) {
131                    LOG.debug("Not re-extracting. Using cached content for '" + resource.getRootPath() + "'.");
132                }
133            }
134
135            if ((content == null) && isOnlyDependentOnContent()) {
136                // extraction result has not been found in the cache
137                // use the currently indexed content, if it is still up to date.
138                content = index.getContentIfUnchanged(resource);
139                if ((content != null) && LOG.isDebugEnabled()) {
140                    LOG.debug(
141                        "Not re-extracting. Using previously indexed content for '" + resource.getRootPath() + "'.");
142                }
143            }
144
145            if (content == null) {
146                // extraction result has not been attached to the resource
147                try {
148                    content = extractContent(cms, resource, index);
149                    if (LOG.isDebugEnabled()) {
150                        LOG.debug("Extracting content for '" + resource.getRootPath() + "' successful.");
151                    }
152                    if ((cache != null) && (resource.getSiblingCount() > 1)) {
153                        // save extracted content to the cache
154                        cache.saveCacheObject(cacheName, content);
155                    }
156                } catch (CmsIndexNoContentException e) {
157                    // there was no content found for the resource
158                    LOG.info(
159                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath())
160                            + " "
161                            + e.getMessage());
162                } catch (Throwable e) {
163                    // text extraction failed for document - continue indexing meta information only
164                    LOG.error(
165                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
166                        e);
167                }
168            }
169        }
170
171        // create the Lucene document according to the index field configuration
172        return index.getFieldConfiguration().createDocument(cms, resource, index, content);
173    }
174
175    /**
176     * @see org.opencms.search.documents.I_CmsDocumentFactory#getCache()
177     */
178    public CmsExtractionResultCache getCache() {
179
180        return m_cache;
181    }
182
183    /**
184     * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
185     */
186    public List<String> getDocumentKeys(List<String> resourceTypes, List<String> mimeTypes) throws CmsException {
187
188        List<String> keys = new ArrayList<String>();
189
190        try {
191            for (Iterator<String> i = resourceTypes.iterator(); i.hasNext();) {
192
193                String typeName = i.next();
194                if (typeName.equals("*")) {
195                    typeName = DEFAULT_ALL_UNCONFIGURED_TYPES;
196                }
197                if (typeName.equals("**")) {
198                    typeName = DEFAULT_ALL_TYPES;
199                }
200                for (Iterator<String> j = mimeTypes.iterator(); j.hasNext();) {
201                    keys.add(getDocumentKey(typeName, j.next()));
202                }
203                if (mimeTypes.isEmpty()) {
204                    keys.add(getDocumentKey(typeName, null));
205                }
206            }
207        } catch (Exception exc) {
208            throw new CmsException(Messages.get().container(Messages.ERR_CREATE_DOC_KEY_0), exc);
209        }
210
211        return keys;
212    }
213
214    /**
215     * @see org.opencms.search.documents.I_CmsDocumentFactory#getName()
216     */
217    public String getName() {
218
219        return m_name;
220    }
221
222    /**
223     * @see org.opencms.search.documents.I_CmsDocumentFactory#setCache(org.opencms.search.documents.CmsExtractionResultCache)
224     */
225    public void setCache(CmsExtractionResultCache cache) {
226
227        m_cache = cache;
228    }
229
230    /**
231     * Logs content extraction for the specified resource and index.<p>
232     *
233     * @param resource the resource to log content extraction for
234     * @param index the search index to log content extraction for
235     */
236    protected void logContentExtraction(CmsResource resource, I_CmsSearchIndex index) {
237
238        if (LOG.isDebugEnabled()) {
239            LOG.debug(
240                Messages.get().getBundle().key(
241                    Messages.LOG_EXTRACT_CONTENT_2,
242                    resource.getRootPath(),
243                    index.getName()));
244        }
245    }
246
247    /**
248     * Upgrades the given resource to a {@link CmsFile} with content.<p>
249     *
250     * @param cms the current users OpenCms context
251     * @param resource the resource to upgrade
252     *
253     * @return the given resource upgraded to a {@link CmsFile} with content
254     *
255     * @throws CmsException if the resource could not be read
256     * @throws CmsIndexNoContentException if the resource has no content
257     */
258    protected CmsFile readFile(CmsObject cms, CmsResource resource) throws CmsException, CmsIndexNoContentException {
259
260        CmsFile file = cms.readFile(resource);
261        if (file.getLength() <= 0) {
262            throw new CmsIndexNoContentException(
263                Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
264        }
265        return file;
266    }
267}