001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.documents;
029
030import org.opencms.cache.CmsVfsDiskCache;
031import org.opencms.file.CmsResource;
032import org.opencms.main.CmsLog;
033import org.opencms.search.extractors.CmsExtractionResult;
034import org.opencms.search.extractors.I_CmsExtractionResult;
035import org.opencms.util.CmsFileUtil;
036
037import java.io.File;
038import java.io.IOException;
039import java.util.Locale;
040
041import org.apache.commons.logging.Log;
042
043/**
044 * Implements a disk cache that stores text extraction results in the RFS.<p>
045 *
046 * This cache operates on resource file names, plus a hash code calculated from
047 * {@link org.opencms.file.CmsResource#getDateLastModified()}
048 * and {@link org.opencms.file.CmsResource#getLength()}. Optional a locale can be appended to this name.<p>
049 *
050 * Since text extraction is done only on the content of a resource, all siblings must have the same content.
051 * The difference can be only by the locale setting in case of an XML content or XML page. However,
052 * the most problematic contents to extract for the search are in fact the MS Office and PDF formats.
053 * For these documents, all siblings must produce the exact same text extraction result.<p>
054 *
055 * This cache is usable for resources from the online AND the offline project at the same time,
056 * because any change to a resource will result in a changed hash code. This means a resource changed in the offline
057 * project will have a new hash code compared to the online project. If the resource is identical in the online and
058 * the offline project, the generated hash codes will be the same.<p>
059 *
060 * @since 6.2.0
061 */
062public class CmsExtractionResultCache {
063
064    /** The log object for this class. */
065    private static final Log LOG = CmsLog.getLog(CmsExtractionResultCache.class);
066
067    /** The name of the cache base repository folder in the RFS. */
068    private String m_rfsRepository;
069
070    /**
071     * Creates a new disk cache.<p>
072     *
073     * @param basepath the base path for the cache in the RFS
074     * @param foldername the folder name for this cache, to be used a subfolder for the base folder
075     */
076    public CmsExtractionResultCache(String basepath, String foldername) {
077
078        // normalize the given folder name
079        m_rfsRepository = CmsFileUtil.normalizePath(basepath + foldername + File.separatorChar);
080    }
081
082    /**
083     * Removes all expired extraction result cache entries from the RFS cache.<p>
084     *
085     * @param maxAge the maximum age of the extraction result cache files in hours (or fractions of hours)
086     *
087     * @return the total number of deleted resources
088     */
089    public synchronized int cleanCache(float maxAge) {
090
091        // calculate oldest possible date for the cache files
092        long expireDate = System.currentTimeMillis() - (long)(maxAge * 60.0f * 60.0f * 1000.0f);
093        File basedir = new File(m_rfsRepository);
094        // perform the cache cleanup
095        int count = 0;
096        if (basedir.canRead() && basedir.isDirectory()) {
097            File[] files = basedir.listFiles();
098            if (files != null) {
099                for (int i = 0; i < files.length; i++) {
100                    File f = files[i];
101                    if (f.canWrite()) {
102                        if (f.lastModified() < expireDate) {
103                            try {
104                                f.delete();
105                                count++;
106                            } catch (Exception e) {
107                                if (LOG.isWarnEnabled()) {
108                                    LOG.warn(
109                                        Messages.get().getBundle().key(
110                                            Messages.LOG_EXCERPT_CACHE_DELETE_ERROR_1,
111                                            f.getAbsolutePath()),
112                                        e);
113                                }
114                            }
115                        }
116                    }
117                }
118            }
119        }
120        return count;
121    }
122
123    /**
124     * Returns the RFS name used for caching an the text extraction result
125     * based on the given VFS resource and locale.<p>
126     *
127     * @param resource the VFS resource to generate the cache name for
128     * @param locale the locale to generate the cache name for (may be <code>null</code>)
129     * @param docTypeName the name of the search document type
130     *
131     * @return the RFS name to use for caching the given VFS resource with parameters
132     */
133    public String getCacheName(CmsResource resource, Locale locale, String docTypeName) {
134
135        // create a StringBuffer for the result
136        StringBuffer buf = new StringBuffer(m_rfsRepository.length() + 36);
137        buf.append(m_rfsRepository);
138        buf.append('/');
139        buf.append(resource.getResourceId().toString());
140
141        if (docTypeName != null) {
142            buf.append('_');
143            buf.append(docTypeName);
144        }
145
146        // check if parameters are provided, if so add them as well
147        if (locale != null) {
148            buf.append('_');
149            buf.append(locale.toString());
150        }
151
152        // append the date of last content modification to the result buffer
153        // please note that we need only worry about last change in content, since properties are ignored here
154        buf.append('_');
155        buf.append(resource.getDateContent());
156
157        // finally append the extension
158        buf.append(".ext");
159        return buf.toString();
160    }
161
162    /**
163     * Returns the extraction result in the requested file in the disk cache, or <code>null</code> if the
164     * file is not found in the cache, or is found but out-dated.<p>
165     *
166     * @param rfsName the file RFS name to look up in the cache
167     *
168     * @return the extraction result stored in the requested file in the RFS disk cache, or <code>null</code>
169     */
170    public CmsExtractionResult getCacheObject(String rfsName) {
171
172        try {
173            File f = new File(rfsName);
174            if (f.exists()) {
175                long age = f.lastModified();
176                if ((System.currentTimeMillis() - age) > 3600000) {
177                    // file has not been touched for 1 hour, touch the file with the current date
178                    f.setLastModified(System.currentTimeMillis());
179                }
180                byte[] byteContent = CmsFileUtil.readFile(f);
181                return CmsExtractionResult.fromBytes(byteContent);
182            }
183        } catch (IOException e) {
184            // unable to read content
185        }
186        // this code can be reached only in case of an error
187        return null;
188    }
189
190    /**
191     * Returns the absolute path of the cache repository in the RFS.<p>
192     *
193     * @return the absolute path of the cache repository in the RFS
194     */
195    public String getRepositoryPath() {
196
197        return m_rfsRepository;
198    }
199
200    /**
201     * Serializes the given extraction result and saves it in the disk cache.<p>
202     *
203     * @param rfsName the RFS name of the file to save the extraction result in
204     * @param content the extraction result to serialize and save
205     *
206     * @throws IOException in case of disk access errors
207     */
208    public void saveCacheObject(String rfsName, I_CmsExtractionResult content) throws IOException {
209
210        byte[] byteContent = content.getBytes();
211        if (byteContent != null) {
212            CmsVfsDiskCache.saveFile(rfsName, byteContent);
213        }
214    }
215}