/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search.documents;

import org.opencms.file.CmsFile;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsResource;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.search.I_CmsSearchDocument;
import org.opencms.search.I_CmsSearchIndex;
import org.opencms.search.extractors.I_CmsExtractionResult;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;

/**
 * Base document factory class for a VFS <code>{@link org.opencms.file.CmsResource}</code>,
 * just requires a specialized implementation of
 * <code>{@link I_CmsDocumentFactory#extractContent(CmsObject, CmsResource, I_CmsSearchIndex)}</code>
 * for text extraction from the binary document content.<p>
 *
 * @since 6.0.0
 */
public abstract class A_CmsVfsDocument implements I_CmsDocumentFactory {

    /**
     * Generic type name used as default for all types that are globally unconfigured.
     * Note that any special xml content is already configured if xmlcontent is configured.
     */
    public static final String DEFAULT_ALL_UNCONFIGURED_TYPES = "__unconfigured__";

    /** Generic type name used as default for all types. */
    public static final String DEFAULT_ALL_TYPES = "__all__";

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(A_CmsVfsDocument.class);

    /** Name of the document type. */
    protected String m_name;

    /** The cache used for storing extracted documents. */
    private CmsExtractionResultCache m_cache;

    /**
     * Creates a new instance of this lucene document factory.<p>
     *
     * @param name name of the document type
     */
    public A_CmsVfsDocument(String name) {

        m_name = name;
    }

    /**
     * Creates a document factory lookup key for the given resource type name / MIME type configuration.<p>
     *
     * The key has the form <code>PREFIX_type</code>, or <code>PREFIX_type:mimeType</code> if a MIME type
     * is given, where <code>PREFIX</code> is {@link I_CmsSearchDocument#VFS_DOCUMENT_KEY_PREFIX}.<p>
     *
     * If the given <code>mimeType</code> is <code>null</code>, this indicates that the key should
     * match all VFS resource of the given resource type regardless of the MIME type.<p>
     *
     * @param type the resource type name to use
     * @param mimeType the MIME type to use, or <code>null</code> to omit the MIME type part
     *
     * @return a document factory lookup key for the given resource type name / MIME type configuration
     */
    public static String getDocumentKey(String type, String mimeType) {

        // the builder never leaves this method, so the unsynchronized StringBuilder is sufficient
        StringBuilder result = new StringBuilder(16);
        result.append(I_CmsSearchDocument.VFS_DOCUMENT_KEY_PREFIX);
        result.append('_');
        result.append(type);
        if (mimeType != null) {
            result.append(':');
            result.append(mimeType);
        }
        return result.toString();
    }

    /**
     * Creates the search document for the given resource.<p>
     *
     * If the index extracts content, the content is looked up in this order and the first hit is used:
     * <ol>
     * <li>the extraction result cache, if a cache is set and the resource has more than one sibling</li>
     * <li>the content returned by the index for an unchanged resource, if this factory reports that it
     *     only depends on the content</li>
     * <li>a fresh extraction using <code>extractContent</code></li>
     * </ol>
     * A failing extraction is logged and not propagated; the document is then created without
     * extracted content.<p>
     *
     * @param cms the OpenCms context passed on to the extraction and the field configuration
     * @param resource the resource to create the document for
     * @param index the index the document is created for
     *
     * @return the document created by the field configuration of the index
     *
     * @throws CmsException if one of the called index or field configuration operations fails
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#createDocument(CmsObject, CmsResource, I_CmsSearchIndex)
     */
    public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
            throws CmsException {

        // stays null if the index does not extract content or if no content could be obtained
        I_CmsExtractionResult content = null;

        if (index.isExtractingContent()) {
            // do full text content extraction only if required

            CmsExtractionResultCache cache = getCache();
            // hard drive based caching only makes sense for resources that have siblings,
            // because the index will also store the content as a blob
            boolean useCache = (cache != null) && (resource.getSiblingCount() > 1);
            String cacheName = null;

            if (useCache) {
                cacheName = cache.getCacheName(
                    resource,
                    // "isLocaleDependend" is the (misspelt) name declared outside this class, keep as is
                    isLocaleDependend() ? index.getLocaleForResource(cms, resource, null) : null,
                    getName());
                content = cache.getCacheObject(cacheName);
                if ((content != null) && LOG.isDebugEnabled()) {
                    LOG.debug("Not re-extracting. Using cached content for '" + resource.getRootPath() + "'.");
                }
            }

            if ((content == null) && isOnlyDependentOnContent()) {
                // extraction result has not been found in the cache
                // use the currently indexed content, if it is still up to date.
                content = index.getContentIfUnchanged(resource);
                if ((content != null) && LOG.isDebugEnabled()) {
                    LOG.debug(
                        "Not re-extracting. Using previously indexed content for '" + resource.getRootPath() + "'.");
                }
            }

            if (content == null) {
                // neither the cache nor the index provided reusable content, so extract it now
                try {
                    content = extractContent(cms, resource, index);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Extracting content for '" + resource.getRootPath() + "' successful.");
                    }
                    if (useCache) {
                        // save extracted content to the cache
                        cache.saveCacheObject(cacheName, content);
                    }
                } catch (CmsIndexNoContentException e) {
                    // there was no content found for the resource
                    LOG.info(
                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath())
                            + " "
                            + e.getMessage());
                } catch (Throwable e) {
                    // text extraction failed for document - continue indexing meta information only
                    LOG.error(
                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
                        e);
                }
            }
        }

        // create the Lucene document according to the index field configuration
        return index.getFieldConfiguration().createDocument(cms, resource, index, content);
    }

    /**
     * Returns the extraction result cache set with {@link #setCache(CmsExtractionResultCache)},
     * or <code>null</code> if none has been set.<p>
     *
     * @return the extraction result cache, may be <code>null</code>
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getCache()
     */
    public CmsExtractionResultCache getCache() {

        return m_cache;
    }

    /**
     * Builds the document factory lookup keys for all combinations of the given resource types
     * and MIME types.<p>
     *
     * The resource type name <code>*</code> is replaced by {@link #DEFAULT_ALL_UNCONFIGURED_TYPES},
     * and <code>**</code> is replaced by {@link #DEFAULT_ALL_TYPES}. If the list of MIME types is empty,
     * one key without MIME type is created per resource type.<p>
     *
     * @param resourceTypes the resource type names to create keys for
     * @param mimeTypes the MIME types to combine with every resource type, may be empty
     *
     * @return the list of generated keys, see {@link #getDocumentKey(String, String)}
     *
     * @throws CmsException if any exception occurs while generating the keys
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
     */
    public List<String> getDocumentKeys(List<String> resourceTypes, List<String> mimeTypes) throws CmsException {

        List<String> keys = new ArrayList<String>();

        try {
            for (String resourceType : resourceTypes) {

                String typeName = resourceType;
                // the two wildcards are mutually exclusive, as the replacement for "*" never equals "**"
                if (typeName.equals("*")) {
                    typeName = DEFAULT_ALL_UNCONFIGURED_TYPES;
                } else if (typeName.equals("**")) {
                    typeName = DEFAULT_ALL_TYPES;
                }
                for (String mimeType : mimeTypes) {
                    keys.add(getDocumentKey(typeName, mimeType));
                }
                if (mimeTypes.isEmpty()) {
                    // no MIME type configured: create a key that matches the type regardless of the MIME type
                    keys.add(getDocumentKey(typeName, null));
                }
            }
        } catch (Exception exc) {
            // this includes runtime failures such as null lists or null type names
            throw new CmsException(Messages.get().container(Messages.ERR_CREATE_DOC_KEY_0), exc);
        }

        return keys;
    }

    /**
     * Returns the name of the document type as passed to the constructor.<p>
     *
     * @return the name of the document type
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getName()
     */
    public String getName() {

        return m_name;
    }

    /**
     * Sets the cache used for storing extracted documents.<p>
     *
     * @param cache the cache to use, <code>null</code> disables the cache lookup in
     *      {@link #createDocument(CmsObject, CmsResource, I_CmsSearchIndex)}
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#setCache(org.opencms.search.documents.CmsExtractionResultCache)
     */
    public void setCache(CmsExtractionResultCache cache) {

        m_cache = cache;
    }

    /**
     * Logs content extraction for the specified resource and index.<p>
     *
     * The message is written on debug level only.<p>
     *
     * @param resource the resource to log content extraction for
     * @param index the search index to log content extraction for
     */
    protected void logContentExtraction(CmsResource resource, I_CmsSearchIndex index) {

        if (LOG.isDebugEnabled()) {
            LOG.debug(
                Messages.get().getBundle().key(
                    Messages.LOG_EXTRACT_CONTENT_2,
                    resource.getRootPath(),
                    index.getName()));
        }
    }

    /**
     * Upgrades the given resource to a {@link CmsFile} with content.<p>
     *
     * @param cms the current users OpenCms context
     * @param resource the resource to upgrade
     *
     * @return the given resource upgraded to a {@link CmsFile} with content
     *
     * @throws CmsException if the resource could not be read
     * @throws CmsIndexNoContentException if the resource has no content, that is if the
     *      length of the file that was read is zero or less
     */
    protected CmsFile readFile(CmsObject cms, CmsResource resource) throws CmsException, CmsIndexNoContentException {

        CmsFile file = cms.readFile(resource);
        if (file.getLength() <= 0) {
            throw new CmsIndexNoContentException(
                Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
        }
        return file;
    }
}