001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search;
029
import org.opencms.db.CmsPublishedResource;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsProject;
import org.opencms.file.CmsResource;
import org.opencms.file.CmsResourceFilter;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.report.I_CmsReport;
import org.opencms.util.CmsUUID;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
046
047/**
048 * An indexer indexing {@link CmsResource} based content from the OpenCms VFS.<p>
049 *
050 * @since 6.0.0
051 */
052public class CmsVfsIndexer implements I_CmsIndexer {
053
054    /** The log object for this class. */
055    private static final Log LOG = CmsLog.getLog(CmsVfsIndexer.class);
056
057    // Note: The following member variables must all be "protected" (not "private") since
058    // in case the indexer is extended, the factory method "newInstance()" needs to set them.
059
060    /** The OpenCms user context to use when reading resources from the VFS during indexing. */
061    protected CmsObject m_cms;
062
063    /** The index. */
064    protected I_CmsSearchIndex m_index;
065
066    /** The report. */
067    protected I_CmsReport m_report;
068
069    /**
070     * @see org.opencms.search.I_CmsIndexer#deleteResources(org.opencms.search.I_CmsIndexWriter, java.util.List)
071     */
072    public void deleteResources(I_CmsIndexWriter indexWriter, List<CmsPublishedResource> resourcesToDelete) {
073
074        if ((resourcesToDelete == null) || resourcesToDelete.isEmpty()) {
075            // nothing to delete
076            return;
077        }
078
079        // contains all resources already deleted to avoid multiple deleting in case of siblings
080        List<CmsUUID> resourcesAlreadyDeleted = new ArrayList<CmsUUID>(resourcesToDelete.size());
081
082        Iterator<CmsPublishedResource> i = resourcesToDelete.iterator();
083        while (i.hasNext()) {
084            // iterate all resources in the given list of resources to delete
085            CmsPublishedResource res = i.next();
086            if (!resourcesAlreadyDeleted.contains(res.getStructureId())) {
087                // ensure siblings are only deleted once per update
088                resourcesAlreadyDeleted.add(res.getStructureId());
089                if (!res.isFolder() && !CmsResource.isTemporaryFileName(res.getRootPath())) {
090                    // now delete the resource from the index
091                    deleteResource(indexWriter, res);
092                }
093            }
094        }
095    }
096
097    /**
098     * Returns the OpenCms user context used by this indexer.<p>
099     *
100     * @return the OpenCms user context used by this indexer
101     */
102    public CmsObject getCms() {
103
104        return m_cms;
105    }
106
107    /**
108     * Returns the OpenCms search index updated by this indexer.<p>
109     *
110     * @return the OpenCms search index updated by this indexer
111     */
112    public I_CmsSearchIndex getIndex() {
113
114        return m_index;
115    }
116
117    /**
118     * Returns the report used by this indexer.<p>
119     *
120     * @return the report used by this indexer
121     */
122    public I_CmsReport getReport() {
123
124        return m_report;
125    }
126
127    /**
128     * @see org.opencms.search.I_CmsIndexer#getUpdateData(org.opencms.search.CmsSearchIndexSource, java.util.List)
129     */
130    public CmsSearchIndexUpdateData getUpdateData(
131        CmsSearchIndexSource source,
132        List<CmsPublishedResource> publishedResources) {
133
134        // create a new update collection from this indexer and the given index source
135        CmsSearchIndexUpdateData result = new CmsSearchIndexUpdateData(source, this);
136
137        Iterator<CmsPublishedResource> i = publishedResources.iterator();
138        while (i.hasNext()) {
139            // check all published resources if they match this indexer / source
140            CmsPublishedResource pubRes = i.next();
141            // VFS resources will always have a structure id
142            if (!pubRes.getStructureId().isNullUUID()) {
143                // use utility method from CmsProject to check if published resource is "inside" this index source
144                if (CmsProject.isInsideProject(source.getResourcesNames(), pubRes.getRootPath())) {
145                    // the resource is "inside" this index source
146                    addResourceToUpdateData(pubRes, result);
147                }
148            }
149        }
150        return result;
151    }
152
153    /**
154     * The default indexer is not able to resolve locale dependencies between documents.<p>
155     *
156     * @see org.opencms.search.I_CmsIndexer#isLocaleDependenciesEnable()
157     */
158    public boolean isLocaleDependenciesEnable() {
159
160        return false;
161    }
162
163    /**
164     * @see org.opencms.search.I_CmsIndexer#newInstance(org.opencms.file.CmsObject, org.opencms.report.I_CmsReport, org.opencms.search.I_CmsSearchIndex)
165     */
166    public I_CmsIndexer newInstance(CmsObject cms, I_CmsReport report, I_CmsSearchIndex index) {
167
168        CmsVfsIndexer indexer = null;
169        try {
170            indexer = getClass().newInstance();
171            indexer.m_cms = cms;
172            indexer.m_report = report;
173            indexer.m_index = index;
174        } catch (Exception e) {
175            LOG.error(
176                Messages.get().getBundle().key(
177                    Messages.ERR_INDEXSOURCE_INDEXER_CLASS_NAME_2,
178                    getClass().getName(),
179                    CmsVfsIndexer.class),
180                e);
181        }
182        return indexer;
183    }
184
185    /**
186     * @see org.opencms.search.I_CmsIndexer#rebuildIndex(org.opencms.search.I_CmsIndexWriter, org.opencms.search.CmsIndexingThreadManager, org.opencms.search.CmsSearchIndexSource)
187     */
188    public void rebuildIndex(
189        I_CmsIndexWriter writer,
190        CmsIndexingThreadManager threadManager,
191        CmsSearchIndexSource source) {
192
193        List<String> resourceNames = source.getResourcesNames();
194        Iterator<String> i = resourceNames.iterator();
195        while (i.hasNext()) {
196            // read the resources from all configured source folders
197            String resourceName = i.next();
198            List<CmsResource> resources = null;
199            try {
200                // read all resources (only files) below the given path
201                resources = m_cms.readResources(resourceName, CmsResourceFilter.IGNORE_EXPIRATION.addRequireFile());
202            } catch (CmsException e) {
203                if (m_report != null) {
204                    m_report.println(
205                        Messages.get().container(
206                            Messages.RPT_UNABLE_TO_READ_SOURCE_2,
207                            resourceName,
208                            e.getLocalizedMessage()),
209                        I_CmsReport.FORMAT_WARNING);
210                }
211                if (LOG.isWarnEnabled()) {
212                    LOG.warn(
213                        Messages.get().getBundle().key(
214                            Messages.LOG_UNABLE_TO_READ_SOURCE_2,
215                            resourceName,
216                            m_index.getName()),
217                        e);
218                }
219            }
220            if (resources != null) {
221                // iterate all resources found in the folder
222                Iterator<CmsResource> j = resources.iterator();
223                while (j.hasNext()) {
224                    // now update all the resources individually
225                    CmsResource resource = j.next();
226                    updateResource(writer, threadManager, resource);
227                }
228            }
229        }
230    }
231
232    /**
233     * @see org.opencms.search.I_CmsIndexer#updateResources(org.opencms.search.I_CmsIndexWriter, org.opencms.search.CmsIndexingThreadManager, java.util.List)
234     */
235    public void updateResources(
236        I_CmsIndexWriter writer,
237        CmsIndexingThreadManager threadManager,
238        List<CmsPublishedResource> resourcesToUpdate) {
239
240        if ((resourcesToUpdate == null) || resourcesToUpdate.isEmpty()) {
241            // nothing to update
242            return;
243        }
244
245        // contains all resources already updated to avoid multiple updates in case of siblings
246        List<String> resourcesAlreadyUpdated = new ArrayList<String>(resourcesToUpdate.size());
247
248        // index all resources that are in the given list
249        Iterator<CmsPublishedResource> i = resourcesToUpdate.iterator();
250        while (i.hasNext()) {
251            CmsPublishedResource res = i.next();
252            CmsResource resource = null;
253            if (!CmsResource.isTemporaryFileName(res.getRootPath())) {
254                try {
255                    resource = m_cms.readResource(res.getRootPath(), CmsResourceFilter.IGNORE_EXPIRATION);
256                } catch (CmsException e) {
257                    if (LOG.isWarnEnabled()) {
258                        LOG.warn(
259                            Messages.get().getBundle().key(
260                                Messages.LOG_UNABLE_TO_READ_RESOURCE_2,
261                                res.getRootPath(),
262                                m_index.getName()),
263                            e);
264                    }
265                }
266
267                if (resource != null) {
268                    if (!resourcesAlreadyUpdated.contains(resource.getRootPath())) {
269                        // ensure resources are only indexed once per update
270                        resourcesAlreadyUpdated.add(resource.getRootPath());
271                        updateResource(writer, threadManager, resource);
272                    }
273                }
274            }
275        }
276    }
277
278    /**
279     * Adds a given published resource to the provided search index update data.<p>
280     *
281     * This method decides if the resource has to be included in the "update" or "delete" list.<p>
282     *
283     * @param pubRes the published resource to add
284     * @param updateData the search index update data to add the resource to
285     */
286    protected void addResourceToUpdateData(CmsPublishedResource pubRes, CmsSearchIndexUpdateData updateData) {
287
288        if (pubRes.getState().isDeleted()) {
289            // deleted resource just needs to be removed
290            updateData.addResourceToDelete(pubRes);
291        } else if (pubRes.getState().isNew() || pubRes.getState().isChanged() || pubRes.getState().isUnchanged()) {
292            updateData.addResourceToUpdate(pubRes);
293        }
294    }
295
296    /**
297     * Deletes a resource with the given index writer.<p>
298     *
299     * @param indexWriter the index writer to resource the resource with
300     * @param resource the root path of the resource to delete
301     */
302    protected void deleteResource(I_CmsIndexWriter indexWriter, CmsPublishedResource resource) {
303
304        try {
305            if (LOG.isInfoEnabled()) {
306                LOG.info(Messages.get().getBundle().key(Messages.LOG_DELETING_FROM_INDEX_1, resource.getRootPath()));
307            }
308            // delete all documents with this term from the index
309            indexWriter.deleteDocument(resource);
310        } catch (IOException e) {
311            if (LOG.isWarnEnabled()) {
312                LOG.warn(
313                    Messages.get().getBundle().key(
314                        Messages.LOG_IO_INDEX_DOCUMENT_DELETE_2,
315                        resource.getRootPath(),
316                        m_index.getName()),
317                    e);
318            }
319        }
320    }
321
322    /**
323     * Checks if the published resource is inside the time window set with release and expiration date.<p>
324     *
325     * @param resource the published resource to check
326     * @return true if the published resource is inside the time window, otherwise false
327     */
328    protected boolean isResourceInTimeWindow(CmsPublishedResource resource) {
329
330        return m_cms.existsResource(
331            m_cms.getRequestContext().removeSiteRoot(resource.getRootPath()),
332            CmsResourceFilter.DEFAULT);
333    }
334
335    /**
336     * Updates (writes) a single resource in the index.<p>
337     *
338     * @param writer the index writer to use
339     * @param threadManager the thread manager to use when extracting the document text
340     * @param resource the resource to update
341     */
342    protected void updateResource(
343        I_CmsIndexWriter writer,
344        CmsIndexingThreadManager threadManager,
345        CmsResource resource) {
346
347        if (resource.isFolder() || resource.isTemporaryFile()) {
348            // don't ever index folders or temporary files
349            return;
350        }
351        try {
352            // create the index thread for the resource
353            threadManager.createIndexingThread(this, writer, resource);
354        } catch (Throwable e) {
355            // Only runtime exceptions can appear here.
356            if (m_report != null) {
357                m_report.println(
358                    Messages.get().container(Messages.RPT_SEARCH_INDEXING_FAILED_0),
359                    I_CmsReport.FORMAT_WARNING);
360            }
361            if (LOG.isWarnEnabled()) {
362                LOG.warn(
363                    Messages.get().getBundle().key(
364                        Messages.ERR_INDEX_RESOURCE_FAILED_2,
365                        resource.getRootPath(),
366                        m_index.getName()),
367                    e);
368            }
369        }
370    }
371
372    /**
373     * Updates a resource with the given index writer and the new document provided.<p>
374     *
375     * @param indexWriter the index writer to update the resource with
376     * @param rootPath the root path of the resource to update
377     * @param doc the new document for the resource
378     */
379    protected void updateResource(I_CmsIndexWriter indexWriter, String rootPath, I_CmsSearchDocument doc) {
380
381        try {
382            indexWriter.updateDocument(rootPath, doc);
383        } catch (Exception e) {
384            if (LOG.isWarnEnabled()) {
385                LOG.warn(
386                    Messages.get().getBundle().key(
387                        Messages.LOG_IO_INDEX_DOCUMENT_UPDATE_2,
388                        rootPath,
389                        m_index.getName()),
390                    e);
391            }
392        }
393    }
394}