001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search;
029
030import org.opencms.db.CmsPublishedResource;
031import org.opencms.file.CmsObject;
032import org.opencms.file.CmsProject;
033import org.opencms.file.CmsResource;
034import org.opencms.file.CmsResourceFilter;
035import org.opencms.main.CmsException;
036import org.opencms.main.CmsLog;
037import org.opencms.report.I_CmsReport;
038import org.opencms.security.CmsSecurityException;
039import org.opencms.util.CmsUUID;
040
041import java.io.IOException;
042import java.util.ArrayList;
043import java.util.Iterator;
044import java.util.List;
045
046import org.apache.commons.logging.Log;
047
048/**
049 * An indexer indexing {@link CmsResource} based content from the OpenCms VFS.<p>
050 *
051 * @since 6.0.0
052 */
053public class CmsVfsIndexer implements I_CmsIndexer {
054
055    /** The log object for this class. */
056    private static final Log LOG = CmsLog.getLog(CmsVfsIndexer.class);
057
058    // Note: The following member variables must all be "protected" (not "private") since
059    // in case the indexer is extended, the factory method "newInstance()" needs to set them.
060
061    /** The OpenCms user context to use when reading resources from the VFS during indexing. */
062    protected CmsObject m_cms;
063
064    /** The index. */
065    protected I_CmsSearchIndex m_index;
066
067    /** The report. */
068    protected I_CmsReport m_report;
069
070    /**
071     * @see org.opencms.search.I_CmsIndexer#deleteResources(org.opencms.search.I_CmsIndexWriter, java.util.List)
072     */
073    public void deleteResources(I_CmsIndexWriter indexWriter, List<CmsPublishedResource> resourcesToDelete) {
074
075        if ((resourcesToDelete == null) || resourcesToDelete.isEmpty()) {
076            // nothing to delete
077            return;
078        }
079
080        // contains all resources already deleted to avoid multiple deleting in case of siblings
081        List<CmsUUID> resourcesAlreadyDeleted = new ArrayList<CmsUUID>(resourcesToDelete.size());
082
083        Iterator<CmsPublishedResource> i = resourcesToDelete.iterator();
084        while (i.hasNext()) {
085            // iterate all resources in the given list of resources to delete
086            CmsPublishedResource res = i.next();
087            if (!resourcesAlreadyDeleted.contains(res.getStructureId())) {
088                // ensure siblings are only deleted once per update
089                resourcesAlreadyDeleted.add(res.getStructureId());
090                if (!res.isFolder() && !CmsResource.isTemporaryFileName(res.getRootPath())) {
091                    // now delete the resource from the index
092                    deleteResource(indexWriter, res);
093                }
094            }
095        }
096    }
097
098    /**
099     * Returns the OpenCms user context used by this indexer.<p>
100     *
101     * @return the OpenCms user context used by this indexer
102     */
103    public CmsObject getCms() {
104
105        return m_cms;
106    }
107
108    /**
109     * Returns the OpenCms search index updated by this indexer.<p>
110     *
111     * @return the OpenCms search index updated by this indexer
112     */
113    public I_CmsSearchIndex getIndex() {
114
115        return m_index;
116    }
117
118    /**
119     * Returns the report used by this indexer.<p>
120     *
121     * @return the report used by this indexer
122     */
123    public I_CmsReport getReport() {
124
125        return m_report;
126    }
127
128    /**
129     * @see org.opencms.search.I_CmsIndexer#getUpdateData(org.opencms.search.CmsSearchIndexSource, java.util.List)
130     */
131    public CmsSearchIndexUpdateData getUpdateData(
132        CmsSearchIndexSource source,
133        List<CmsPublishedResource> publishedResources) {
134
135        // create a new update collection from this indexer and the given index source
136        CmsSearchIndexUpdateData result = new CmsSearchIndexUpdateData(source, this);
137
138        Iterator<CmsPublishedResource> i = publishedResources.iterator();
139        while (i.hasNext()) {
140            // check all published resources if they match this indexer / source
141            CmsPublishedResource pubRes = i.next();
142            // VFS resources will always have a structure id
143            if (!pubRes.getStructureId().isNullUUID()) {
144                // use utility method from CmsProject to check if published resource is "inside" this index source
145                if (CmsProject.isInsideProject(source.getResourcesNames(), pubRes.getRootPath())) {
146                    // the resource is "inside" this index source
147                    addResourceToUpdateData(pubRes, result);
148                }
149            }
150        }
151        return result;
152    }
153
154    /**
155     * The default indexer is not able to resolve locale dependencies between documents.<p>
156     *
157     * @see org.opencms.search.I_CmsIndexer#isLocaleDependenciesEnable()
158     */
159    public boolean isLocaleDependenciesEnable() {
160
161        return false;
162    }
163
164    /**
165     * @see org.opencms.search.I_CmsIndexer#newInstance(org.opencms.file.CmsObject, org.opencms.report.I_CmsReport, org.opencms.search.I_CmsSearchIndex)
166     */
167    public I_CmsIndexer newInstance(CmsObject cms, I_CmsReport report, I_CmsSearchIndex index) {
168
169        CmsVfsIndexer indexer = null;
170        try {
171            indexer = getClass().newInstance();
172            indexer.m_cms = cms;
173            indexer.m_report = report;
174            indexer.m_index = index;
175        } catch (Exception e) {
176            LOG.error(
177                Messages.get().getBundle().key(
178                    Messages.ERR_INDEXSOURCE_INDEXER_CLASS_NAME_2,
179                    getClass().getName(),
180                    CmsVfsIndexer.class),
181                e);
182        }
183        return indexer;
184    }
185
186    /**
187     * @see org.opencms.search.I_CmsIndexer#rebuildIndex(org.opencms.search.I_CmsIndexWriter, org.opencms.search.CmsIndexingThreadManager, org.opencms.search.CmsSearchIndexSource)
188     */
189    public void rebuildIndex(
190        I_CmsIndexWriter writer,
191        CmsIndexingThreadManager threadManager,
192        CmsSearchIndexSource source) {
193
194        List<String> resourceNames = source.getResourcesNames();
195        Iterator<String> i = resourceNames.iterator();
196        while (i.hasNext()) {
197            // read the resources from all configured source folders
198            String resourceName = i.next();
199            List<CmsResource> resources = null;
200            try {
201                // read all resources (only files) below the given path
202                resources = m_cms.readResources(resourceName, CmsResourceFilter.IGNORE_EXPIRATION.addRequireFile());
203            } catch (CmsException e) {
204                if (m_report != null) {
205                    m_report.println(
206                        Messages.get().container(
207                            Messages.RPT_UNABLE_TO_READ_SOURCE_2,
208                            resourceName,
209                            e.getLocalizedMessage()),
210                        I_CmsReport.FORMAT_WARNING);
211                }
212                if (LOG.isWarnEnabled()) {
213                    LOG.warn(
214                        Messages.get().getBundle().key(
215                            Messages.LOG_UNABLE_TO_READ_SOURCE_2,
216                            resourceName,
217                            m_index.getName()),
218                        e);
219                }
220            }
221            if (resources != null) {
222                // iterate all resources found in the folder
223                Iterator<CmsResource> j = resources.iterator();
224                while (j.hasNext()) {
225                    // now update all the resources individually
226                    CmsResource resource = j.next();
227                    updateResource(writer, threadManager, resource);
228                }
229            }
230        }
231    }
232
233    /**
234     * @see org.opencms.search.I_CmsIndexer#updateResources(org.opencms.search.I_CmsIndexWriter, org.opencms.search.CmsIndexingThreadManager, java.util.List)
235     */
236    public void updateResources(
237        I_CmsIndexWriter writer,
238        CmsIndexingThreadManager threadManager,
239        List<CmsPublishedResource> resourcesToUpdate) {
240
241        if ((resourcesToUpdate == null) || resourcesToUpdate.isEmpty()) {
242            // nothing to update
243            return;
244        }
245
246        // contains all resources already updated to avoid multiple updates in case of siblings
247        List<String> resourcesAlreadyUpdated = new ArrayList<String>(resourcesToUpdate.size());
248
249        // index all resources that are in the given list
250        Iterator<CmsPublishedResource> i = resourcesToUpdate.iterator();
251        while (i.hasNext()) {
252            CmsPublishedResource res = i.next();
253            CmsResource resource = null;
254            if (!CmsResource.isTemporaryFileName(res.getRootPath())) {
255                try {
256                    resource = m_cms.readResource(res.getRootPath(), CmsResourceFilter.IGNORE_EXPIRATION);
257                } catch (CmsException e) {
258                    if (LOG.isWarnEnabled()) {
259                        LOG.warn(
260                            Messages.get().getBundle().key(
261                                Messages.LOG_UNABLE_TO_READ_RESOURCE_2,
262                                res.getRootPath(),
263                                m_index.getName()),
264                            e);
265                    }
266                }
267                if (resource != null) {
268                    if (!resourcesAlreadyUpdated.contains(resource.getRootPath())) {
269                        // ensure resources are only indexed once per update
270                        resourcesAlreadyUpdated.add(resource.getRootPath());
271                        updateResource(writer, threadManager, resource);
272                    }
273                    if (resource.isFolder()) {
274                        try {
275                            CmsResource defaultFile = m_cms.readDefaultFile(
276                                resource,
277                                CmsResourceFilter.ONLY_VISIBLE_NO_DELETED);
278                            if ((defaultFile != null) && !resourcesAlreadyUpdated.contains(defaultFile.getRootPath())) {
279                                if (LOG.isDebugEnabled()) {
280                                    LOG.warn(
281                                        Messages.get().getBundle().key(
282                                            Messages.LOG_INDEXING_DEFAULT_FILE_FOR_FOLDER_3,
283                                            defaultFile.getRootPath(),
284                                            res.getRootPath(),
285                                            m_index.getName()));
286                                }
287                                updateResource(writer, threadManager, defaultFile);
288                            }
289                        } catch (CmsSecurityException e) {
290                            if (LOG.isWarnEnabled()) {
291                                LOG.warn(
292                                    Messages.get().getBundle().key(
293                                        Messages.LOG_UNABLE_TO_READ_DEFAULT_FILE_FOR_FOLDER_2,
294                                        res.getRootPath(),
295                                        m_index.getName()),
296                                    e);
297                            }
298                        }
299
300                    }
301                }
302            }
303        }
304    }
305
306    /**
307     * Adds a given published resource to the provided search index update data.<p>
308     *
309     * This method decides if the resource has to be included in the "update" or "delete" list.<p>
310     *
311     * @param pubRes the published resource to add
312     * @param updateData the search index update data to add the resource to
313     */
314    protected void addResourceToUpdateData(CmsPublishedResource pubRes, CmsSearchIndexUpdateData updateData) {
315
316        if (pubRes.getState().isDeleted()) {
317            // deleted resource just needs to be removed
318            updateData.addResourceToDelete(pubRes);
319        } else if (pubRes.getState().isNew() || pubRes.getState().isChanged() || pubRes.getState().isUnchanged()) {
320            updateData.addResourceToUpdate(pubRes);
321        }
322    }
323
324    /**
325     * Deletes a resource with the given index writer.<p>
326     *
327     * @param indexWriter the index writer to resource the resource with
328     * @param resource the root path of the resource to delete
329     */
330    protected void deleteResource(I_CmsIndexWriter indexWriter, CmsPublishedResource resource) {
331
332        try {
333            if (LOG.isInfoEnabled()) {
334                LOG.info(Messages.get().getBundle().key(Messages.LOG_DELETING_FROM_INDEX_1, resource.getRootPath()));
335            }
336            // delete all documents with this term from the index
337            indexWriter.deleteDocument(resource);
338        } catch (IOException e) {
339            if (LOG.isWarnEnabled()) {
340                LOG.warn(
341                    Messages.get().getBundle().key(
342                        Messages.LOG_IO_INDEX_DOCUMENT_DELETE_2,
343                        resource.getRootPath(),
344                        m_index.getName()),
345                    e);
346            }
347        }
348    }
349
350    /**
351     * Checks if the published resource is inside the time window set with release and expiration date.<p>
352     *
353     * @param resource the published resource to check
354     * @return true if the published resource is inside the time window, otherwise false
355     */
356    protected boolean isResourceInTimeWindow(CmsPublishedResource resource) {
357
358        return m_cms.existsResource(
359            m_cms.getRequestContext().removeSiteRoot(resource.getRootPath()),
360            CmsResourceFilter.DEFAULT);
361    }
362
363    /**
364     * Updates (writes) a single resource in the index.<p>
365     *
366     * @param writer the index writer to use
367     * @param threadManager the thread manager to use when extracting the document text
368     * @param resource the resource to update
369     */
370    protected void updateResource(
371        I_CmsIndexWriter writer,
372        CmsIndexingThreadManager threadManager,
373        CmsResource resource) {
374
375        if (resource.isFolder() || resource.isTemporaryFile()) {
376            // don't ever index folders or temporary files
377            return;
378        }
379        try {
380            // create the index thread for the resource
381            threadManager.createIndexingThread(this, writer, resource);
382        } catch (Throwable e) {
383            // Only runtime exceptions can appear here.
384            if (m_report != null) {
385                m_report.println(
386                    Messages.get().container(Messages.RPT_SEARCH_INDEXING_FAILED_0),
387                    I_CmsReport.FORMAT_WARNING);
388            }
389            if (LOG.isWarnEnabled()) {
390                LOG.warn(
391                    Messages.get().getBundle().key(
392                        Messages.ERR_INDEX_RESOURCE_FAILED_2,
393                        resource.getRootPath(),
394                        m_index.getName()),
395                    e);
396            }
397        }
398    }
399
400    /**
401     * Updates a resource with the given index writer and the new document provided.<p>
402     *
403     * @param indexWriter the index writer to update the resource with
404     * @param rootPath the root path of the resource to update
405     * @param doc the new document for the resource
406     */
407    protected void updateResource(I_CmsIndexWriter indexWriter, String rootPath, I_CmsSearchDocument doc) {
408
409        try {
410            indexWriter.updateDocument(rootPath, doc);
411        } catch (Exception e) {
412            if (LOG.isWarnEnabled()) {
413                LOG.warn(
414                    Messages.get().getBundle().key(
415                        Messages.LOG_IO_INDEX_DOCUMENT_UPDATE_2,
416                        rootPath,
417                        m_index.getName()),
418                    e);
419            }
420        }
421    }
422}