/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search.documents;

import org.opencms.cache.CmsVfsDiskCache;
import org.opencms.file.CmsResource;
import org.opencms.main.CmsLog;
import org.opencms.search.extractors.CmsExtractionResult;
import org.opencms.search.extractors.I_CmsExtractionResult;
import org.opencms.util.CmsFileUtil;

import java.io.File;
import java.io.IOException;
import java.util.Locale;

import org.apache.commons.logging.Log;

/**
 * Implements a disk cache that stores text extraction results in the RFS.<p>
 *
 * The name of a cache file is generated by {@link #getCacheName(CmsResource, Locale, String)}
 * from the resource id, optionally the name of the search document type and the locale,
 * and the value of {@link org.opencms.file.CmsResource#getDateContent()}.<p>
 *
 * Since text extraction is done only on the content of a resource, all siblings must have the same content.
 * The difference can be only by the locale setting in case of an XML content or XML page. However,
 * the most problematic contents to extract for the search are in fact the MS Office and PDF formats.
 * For these documents, all siblings must produce the exact same text extraction result.<p>
 *
 * This cache is usable for resources from the online AND the offline project at the same time,
 * because a change to the content of a resource results in a changed content date and therefore
 * in a different cache file name. If the resource is identical in the online and
 * the offline project, the generated cache file names will be the same.<p>
 *
 * @since 6.2.0
 */
public class CmsExtractionResultCache {

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsExtractionResultCache.class);

    /** Minimum time in milliseconds (1 hour) a cache file must be untouched before a read refreshes its date. */
    private static final long TOUCH_INTERVAL_MS = 3600000L;

    /** The name of the cache base repository folder in the RFS. */
    private final String m_rfsRepository;

    /**
     * Creates a new disk cache.<p>
     *
     * @param basepath the base path for the cache in the RFS
     * @param foldername the folder name for this cache, to be used as a subfolder of the base folder
     */
    public CmsExtractionResultCache(String basepath, String foldername) {

        // normalize the given folder name
        m_rfsRepository = CmsFileUtil.normalizePath(basepath + foldername + File.separatorChar);
    }

    /**
     * Removes all expired extraction result cache entries from the RFS cache.<p>
     *
     * An entry is expired if its "last modified" date is older than the given maximum age.
     * Only entries that have actually been deleted are counted; entries that could not be
     * deleted are logged with level "warn" and left in place.<p>
     *
     * @param maxAge the maximum age of the extraction result cache files in hours (or fractions of hours)
     *
     * @return the total number of deleted resources
     */
    public synchronized int cleanCache(float maxAge) {

        // calculate oldest possible date for the cache files
        long expireDate = System.currentTimeMillis() - (long)(maxAge * 60.0f * 60.0f * 1000.0f);
        File basedir = new File(m_rfsRepository);
        int count = 0;
        if (!basedir.canRead() || !basedir.isDirectory()) {
            // nothing to clean up
            return count;
        }
        File[] files = basedir.listFiles();
        if (files == null) {
            // listing the directory failed
            return count;
        }
        for (File f : files) {
            if (f.canWrite() && (f.lastModified() < expireDate)) {
                try {
                    // File#delete() signals a failure by returning false, not by throwing an exception
                    if (f.delete()) {
                        count++;
                    } else if (LOG.isWarnEnabled()) {
                        LOG.warn(
                            Messages.get().getBundle().key(
                                Messages.LOG_EXCERPT_CACHE_DELETE_ERROR_1,
                                f.getAbsolutePath()));
                    }
                } catch (SecurityException e) {
                    // a security manager denied the delete access
                    if (LOG.isWarnEnabled()) {
                        LOG.warn(
                            Messages.get().getBundle().key(
                                Messages.LOG_EXCERPT_CACHE_DELETE_ERROR_1,
                                f.getAbsolutePath()),
                            e);
                    }
                }
            }
        }
        return count;
    }

    /**
     * Returns the RFS name used for caching the text extraction result
     * based on the given VFS resource and locale.<p>
     *
     * The generated name has the form
     * <code>{repository}/{resourceId}[_{docTypeName}][_{locale}]_{dateContent}.ext</code>.<p>
     *
     * @param resource the VFS resource to generate the cache name for
     * @param locale the locale to generate the cache name for (may be <code>null</code>)
     * @param docTypeName the name of the search document type (may be <code>null</code>)
     *
     * @return the RFS name to use for caching the given VFS resource with parameters
     */
    public String getCacheName(CmsResource resource, Locale locale, String docTypeName) {

        // the buffer is local to this method, so no synchronized StringBuffer is required
        StringBuilder buf = new StringBuilder(m_rfsRepository.length() + 36);
        buf.append(m_rfsRepository);
        buf.append('/');
        buf.append(resource.getResourceId().toString());

        if (docTypeName != null) {
            buf.append('_');
            buf.append(docTypeName);
        }

        // check if a locale is provided, if so add it as well
        if (locale != null) {
            buf.append('_');
            buf.append(locale.toString());
        }

        // append the date of last content modification to the result buffer
        // please note that we need only worry about last change in content, since properties are ignored here
        buf.append('_');
        buf.append(resource.getDateContent());

        // finally append the extension
        buf.append(".ext");
        return buf.toString();
    }

    /**
     * Returns the extraction result in the requested file in the disk cache, or <code>null</code> if the
     * file is not found in the cache or can not be read.<p>
     *
     * If the file has not been modified for more than one hour, its "last modified" date is set
     * to the current time, so that entries still in use are not removed by {@link #cleanCache(float)}.<p>
     *
     * @param rfsName the file RFS name to look up in the cache
     *
     * @return the extraction result stored in the requested file in the RFS disk cache, or <code>null</code>
     */
    public CmsExtractionResult getCacheObject(String rfsName) {

        try {
            File f = new File(rfsName);
            if (f.exists()) {
                long lastModified = f.lastModified();
                if ((System.currentTimeMillis() - lastModified) > TOUCH_INTERVAL_MS) {
                    // file has not been touched for 1 hour, touch the file with the current date
                    // (best effort, a failure to update the date does not prevent reading the file)
                    f.setLastModified(System.currentTimeMillis());
                }
                byte[] byteContent = CmsFileUtil.readFile(f);
                return CmsExtractionResult.fromBytes(byteContent);
            }
        } catch (IOException e) {
            // unable to read content, treat this like a cache miss but leave a trace in the log
            if (LOG.isDebugEnabled()) {
                LOG.debug(e.getLocalizedMessage(), e);
            }
        }
        // this code is reached if the file does not exist or could not be read
        return null;
    }

    /**
     * Returns the absolute path of the cache repository in the RFS.<p>
     *
     * @return the absolute path of the cache repository in the RFS
     */
    public String getRepositoryPath() {

        return m_rfsRepository;
    }

    /**
     * Serializes the given extraction result and saves it in the disk cache.<p>
     *
     * Nothing is written if the serialization of the extraction result yields <code>null</code>.<p>
     *
     * @param rfsName the RFS name of the file to save the extraction result in
     * @param content the extraction result to serialize and save
     *
     * @throws IOException in case of disk access errors
     */
    public void saveCacheObject(String rfsName, I_CmsExtractionResult content) throws IOException {

        byte[] byteContent = content.getBytes();
        if (byteContent != null) {
            CmsVfsDiskCache.saveFile(rfsName, byteContent);
        }
    }
}