/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search.documents;

import org.opencms.file.CmsFile;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsResource;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.search.I_CmsSearchDocument;
import org.opencms.search.I_CmsSearchIndex;
import org.opencms.search.extractors.I_CmsExtractionResult;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;

/**
 * Base document factory class for a VFS <code>{@link org.opencms.file.CmsResource}</code>,
 * just requires a specialized implementation of
 * <code>{@link I_CmsDocumentFactory#extractContent(CmsObject, CmsResource, I_CmsSearchIndex)}</code>
 * for text extraction from the binary document content.<p>
 *
 * @since 6.0.0
 */
public abstract class A_CmsVfsDocument implements I_CmsDocumentFactory {

    /**
     * Generic type name used as default for all types that are globally unconfigured.
     * Note that any special xml content is already configured if xmlcontent is configured.
     */
    public static final String DEFAULT_ALL_UNCONFIGURED_TYPES = "__unconfigured__";

    /** Generic type name used as default for all types. */
    public static final String DEFAULT_ALL_TYPES = "__all__";

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(A_CmsVfsDocument.class);

    /** Name of the document type. */
    protected String m_name;

    /** The cache used for storing extracted documents. */
    private CmsExtractionResultCache m_cache;

    /**
     * Creates a new instance of this lucene document factory.<p>
     *
     * @param name name of the document type
     */
    public A_CmsVfsDocument(String name) {

        m_name = name;
    }

    /**
     * Creates a document factory lookup key for the given resource type name / MIME type configuration.<p>
     *
     * The key has the form <code>PREFIX_type</code> or <code>PREFIX_type:mimeType</code>, where
     * <code>PREFIX</code> is <code>{@link I_CmsSearchDocument#VFS_DOCUMENT_KEY_PREFIX}</code>.<p>
     *
     * If the given <code>mimeType</code> is <code>null</code>, this indicates that the key should
     * match all VFS resource of the given resource type regardless of the MIME type.<p>
     *
     * @param type the resource type name to use
     * @param mimeType the MIME type to use, or <code>null</code> to omit the MIME type part
     *
     * @return a document factory lookup key for the given resource type name / MIME type configuration
     */
    public static String getDocumentKey(String type, String mimeType) {

        // the buffer never leaves this method, so the unsynchronized builder is sufficient
        StringBuilder result = new StringBuilder(16);
        result.append(I_CmsSearchDocument.VFS_DOCUMENT_KEY_PREFIX);
        result.append('_');
        result.append(type);
        if (mimeType != null) {
            result.append(':');
            result.append(mimeType);
        }
        return result.toString();
    }

    /**
     * Generates a new lucene document instance from contents of the given resource for the provided index.<p>
     *
     * If the index extracts content, the extraction result is looked up in this order: the extraction
     * result cache (only for resources with siblings), the content currently stored in the index if it
     * is unchanged, and finally a fresh text extraction. A failing text extraction is logged and the
     * document is then created without extracted content.<p>
     *
     * @param cms the OpenCms context passed on to the extraction and the field configuration
     * @param resource the resource to create the document for
     * @param index the index to create the document for
     *
     * @return the document created by the field configuration of the given index
     *
     * @throws CmsException if an error outside of the text extraction step is propagated by the called components
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#createDocument(CmsObject, CmsResource, I_CmsSearchIndex)
     */
    public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
    throws CmsException {

        // the extracted content, stays null if extraction is disabled or failed
        I_CmsExtractionResult content = null;

        if (index.isExtractingContent()) {
            // do full text content extraction only if required

            // hard drive based caching only makes sense for resources that have siblings,
            // because the index will also store the content as a blob;
            // decide this once so that lookup and save always agree on the cache name
            CmsExtractionResultCache cache = getCache();
            boolean useCache = (cache != null) && (resource.getSiblingCount() > 1);
            String cacheName = null;
            if (useCache) {
                cacheName = cache.getCacheName(
                    resource,
                    isLocaleDependend() ? index.getLocaleForResource(cms, resource, null) : null,
                    getName());
                content = cache.getCacheObject(cacheName);
            }

            if (content == null) {
                // extraction result has not been found in the cache
                // use the currently indexed content, if it is still up to date.
                content = index.getContentIfUnchanged(resource);
            }

            if (content == null) {
                // extraction result has not been attached to the resource
                try {
                    content = extractContent(cms, resource, index);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Extracting content for '" + resource.getRootPath() + "' successful.");
                    }
                    if (useCache) {
                        // save extracted content to the cache
                        cache.saveCacheObject(cacheName, content);
                    }
                } catch (CmsIndexNoContentException e) {
                    // there was no content found for the resource
                    LOG.info(
                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath())
                            + " "
                            + e.getMessage());
                } catch (Throwable e) {
                    // text extraction failed for document - continue indexing meta information only
                    LOG.error(
                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
                        e);
                }
            }
        }

        // create the Lucene document according to the index field configuration
        return index.getFieldConfiguration().createDocument(cms, resource, index, content);
    }

    /**
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getCache()
     */
    public CmsExtractionResultCache getCache() {

        return m_cache;
    }

    /**
     * Creates one lookup key per resource type / MIME type combination.<p>
     *
     * The resource type name <code>*</code> is mapped to <code>{@link #DEFAULT_ALL_UNCONFIGURED_TYPES}</code>
     * and <code>**</code> is mapped to <code>{@link #DEFAULT_ALL_TYPES}</code>. If no MIME types are given,
     * one key without MIME type part is created per resource type.<p>
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
     */
    public List<String> getDocumentKeys(List<String> resourceTypes, List<String> mimeTypes) throws CmsException {

        List<String> keys = new ArrayList<String>();

        try {
            for (String configuredType : resourceTypes) {

                // resolve the wildcard type names, the two wildcards are mutually exclusive
                String typeName = configuredType;
                if (typeName.equals("*")) {
                    typeName = DEFAULT_ALL_UNCONFIGURED_TYPES;
                } else if (typeName.equals("**")) {
                    typeName = DEFAULT_ALL_TYPES;
                }
                for (String mimeType : mimeTypes) {
                    keys.add(getDocumentKey(typeName, mimeType));
                }
                if (mimeTypes.isEmpty()) {
                    // no MIME type configured, the key matches the type regardless of the MIME type
                    keys.add(getDocumentKey(typeName, null));
                }
            }
        } catch (Exception exc) {
            // also covers null lists or null type names, which are reported as key creation errors
            throw new CmsException(Messages.get().container(Messages.ERR_CREATE_DOC_KEY_0), exc);
        }

        return keys;
    }

    /**
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getName()
     */
    public String getName() {

        return m_name;
    }

    /**
     * @see org.opencms.search.documents.I_CmsDocumentFactory#setCache(org.opencms.search.documents.CmsExtractionResultCache)
     */
    public void setCache(CmsExtractionResultCache cache) {

        m_cache = cache;
    }

    /**
     * Logs content extraction for the specified resource and index.<p>
     *
     * The message is only written if debug logging is enabled.<p>
     *
     * @param resource the resource to log content extraction for
     * @param index the search index to log content extraction for
     */
    protected void logContentExtraction(CmsResource resource, I_CmsSearchIndex index) {

        if (LOG.isDebugEnabled()) {
            LOG.debug(
                Messages.get().getBundle().key(
                    Messages.LOG_EXTRACT_CONTENT_2,
                    resource.getRootPath(),
                    index.getName()));
        }
    }

    /**
     * Upgrades the given resource to a {@link CmsFile} with content.<p>
     *
     * @param cms the current user's OpenCms context
     * @param resource the resource to upgrade
     *
     * @return the given resource upgraded to a {@link CmsFile} with content
     *
     * @throws CmsException if the resource could not be read
     * @throws CmsIndexNoContentException if the resource has no content, that is a length of zero or less
     */
    protected CmsFile readFile(CmsObject cms, CmsResource resource) throws CmsException, CmsIndexNoContentException {

        CmsFile file = cms.readFile(resource);
        if (file.getLength() <= 0) {
            throw new CmsIndexNoContentException(
                Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
        }
        return file;
    }
}