/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search;

import org.opencms.db.CmsPublishedResource;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsProject;
import org.opencms.file.CmsResource;
import org.opencms.file.CmsResourceFilter;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.report.I_CmsReport;
import org.opencms.security.CmsSecurityException;
import org.opencms.util.CmsUUID;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;

/**
 * An indexer indexing {@link CmsResource} based content from the OpenCms VFS.<p>
 *
 * Folders and temporary files are never written to the index; see
 * {@link #updateResource(I_CmsIndexWriter, CmsIndexingThreadManager, CmsResource)}
 * and {@link #deleteResources(I_CmsIndexWriter, List)}.<p>
 *
 * @since 6.0.0
 */
public class CmsVfsIndexer implements I_CmsIndexer {

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsVfsIndexer.class);

    // Note: The following member variables must all be "protected" (not "private") since
    // in case the indexer is extended, the factory method "newInstance()" needs to set them.

    /** The OpenCms user context to use when reading resources from the VFS during indexing. */
    protected CmsObject m_cms;

    /** The index. */
    protected I_CmsSearchIndex m_index;

    /** The report. */
    protected I_CmsReport m_report;

    /**
     * Removes the given published resources from the index.<p>
     *
     * Each structure id is processed at most once per call. Folders and temporary
     * files are skipped.<p>
     *
     * @param indexWriter the index writer to delete the resources with
     * @param resourcesToDelete the resources to delete, may be <code>null</code> or empty
     *
     * @see org.opencms.search.I_CmsIndexer#deleteResources(org.opencms.search.I_CmsIndexWriter, java.util.List)
     */
    public void deleteResources(I_CmsIndexWriter indexWriter, List<CmsPublishedResource> resourcesToDelete) {

        if ((resourcesToDelete == null) || resourcesToDelete.isEmpty()) {
            // nothing to delete
            return;
        }

        // structure ids already handled, to avoid deleting the same entry several times
        // NOTE(review): relies on CmsUUID implementing hashCode() consistently with equals()
        Set<CmsUUID> resourcesAlreadyDeleted = new HashSet<CmsUUID>();

        for (CmsPublishedResource res : resourcesToDelete) {
            if (!resourcesAlreadyDeleted.add(res.getStructureId())) {
                // this structure id has already been processed in this run
                continue;
            }
            if (!res.isFolder() && !CmsResource.isTemporaryFileName(res.getRootPath())) {
                // now delete the resource from the index
                deleteResource(indexWriter, res);
            }
        }
    }

    /**
     * Returns the OpenCms user context used by this indexer.<p>
     *
     * @return the OpenCms user context used by this indexer
     */
    public CmsObject getCms() {

        return m_cms;
    }

    /**
     * Returns the OpenCms search index updated by this indexer.<p>
     *
     * @return the OpenCms search index updated by this indexer
     */
    public I_CmsSearchIndex getIndex() {

        return m_index;
    }

    /**
     * Returns the report used by this indexer.<p>
     *
     * @return the report used by this indexer
     */
    public I_CmsReport getReport() {

        return m_report;
    }

    /**
     * Collects those published resources that belong to the given index source.<p>
     *
     * A published resource is considered if its structure id is not the null UUID and
     * its root path is inside one of the resource names of the index source.<p>
     *
     * @param source the index source to collect the update data for
     * @param publishedResources the published resources to check, may be <code>null</code>
     *
     * @return the update data for this indexer and the given source, never <code>null</code>
     *
     * @see org.opencms.search.I_CmsIndexer#getUpdateData(org.opencms.search.CmsSearchIndexSource, java.util.List)
     */
    public CmsSearchIndexUpdateData getUpdateData(
        CmsSearchIndexSource source,
        List<CmsPublishedResource> publishedResources) {

        // create a new update collection from this indexer and the given index source
        CmsSearchIndexUpdateData result = new CmsSearchIndexUpdateData(source, this);

        if (publishedResources == null) {
            // nothing published, consistent with the null handling of deleteResources / updateResources
            return result;
        }

        for (CmsPublishedResource pubRes : publishedResources) {
            // VFS resources will always have a structure id
            if (pubRes.getStructureId().isNullUUID()) {
                continue;
            }
            // use utility method from CmsProject to check if published resource is "inside" this index source
            if (CmsProject.isInsideProject(source.getResourcesNames(), pubRes.getRootPath())) {
                // the resource is "inside" this index source
                addResourceToUpdateData(pubRes, result);
            }
        }
        return result;
    }

    /**
     * The default indexer is not able to resolve locale dependencies between documents.<p>
     *
     * @return always <code>false</code>
     *
     * @see org.opencms.search.I_CmsIndexer#isLocaleDependenciesEnable()
     */
    public boolean isLocaleDependenciesEnable() {

        return false;
    }

    /**
     * Creates a new indexer of the same runtime class as this instance.<p>
     *
     * The runtime class must provide an accessible no-argument constructor. If the
     * instance can not be created, the error is logged and <code>null</code> is returned.<p>
     *
     * @param cms the OpenCms user context for the new indexer
     * @param report the report for the new indexer
     * @param index the search index for the new indexer
     *
     * @return the new indexer, or <code>null</code> if it could not be created
     *
     * @see org.opencms.search.I_CmsIndexer#newInstance(org.opencms.file.CmsObject, org.opencms.report.I_CmsReport, org.opencms.search.I_CmsSearchIndex)
     */
    public I_CmsIndexer newInstance(CmsObject cms, I_CmsReport report, I_CmsSearchIndex index) {

        CmsVfsIndexer indexer = null;
        try {
            // Constructor.newInstance() wraps exceptions thrown by the constructor,
            // unlike Class.newInstance() which propagates them unchecked
            indexer = getClass().getDeclaredConstructor().newInstance();
            indexer.m_cms = cms;
            indexer.m_report = report;
            indexer.m_index = index;
        } catch (Exception e) {
            LOG.error(
                Messages.get().getBundle().key(
                    Messages.ERR_INDEXSOURCE_INDEXER_CLASS_NAME_2,
                    getClass().getName(),
                    CmsVfsIndexer.class),
                e);
        }
        return indexer;
    }

    /**
     * Indexes all files found below the resource names of the given index source.<p>
     *
     * A source folder that can not be read is reported and logged as a warning,
     * the remaining source folders are still processed.<p>
     *
     * @param writer the index writer to use
     * @param threadManager the thread manager to use when extracting the document text
     * @param source the index source that provides the resource names to read
     *
     * @see org.opencms.search.I_CmsIndexer#rebuildIndex(org.opencms.search.I_CmsIndexWriter, org.opencms.search.CmsIndexingThreadManager, org.opencms.search.CmsSearchIndexSource)
     */
    public void rebuildIndex(
        I_CmsIndexWriter writer,
        CmsIndexingThreadManager threadManager,
        CmsSearchIndexSource source) {

        // read the resources from all configured source folders
        for (String resourceName : source.getResourcesNames()) {
            List<CmsResource> resources = null;
            try {
                // read all resources (only files) below the given path
                resources = m_cms.readResources(resourceName, CmsResourceFilter.IGNORE_EXPIRATION.addRequireFile());
            } catch (CmsException e) {
                if (m_report != null) {
                    m_report.println(
                        Messages.get().container(
                            Messages.RPT_UNABLE_TO_READ_SOURCE_2,
                            resourceName,
                            e.getLocalizedMessage()),
                        I_CmsReport.FORMAT_WARNING);
                }
                if (LOG.isWarnEnabled()) {
                    LOG.warn(
                        Messages.get().getBundle().key(
                            Messages.LOG_UNABLE_TO_READ_SOURCE_2,
                            resourceName,
                            m_index.getName()),
                        e);
                }
            }
            if (resources == null) {
                // reading failed (or nothing was returned), continue with the next source folder
                continue;
            }
            for (CmsResource resource : resources) {
                // now update all the resources individually
                updateResource(writer, threadManager, resource);
            }
        }
    }

    /**
     * Updates the index for the given published resources.<p>
     *
     * Temporary files are skipped, resources that can not be read are logged and skipped.
     * If a published resource is a folder, the default file of this folder is indexed as well.
     * Every root path, including that of a default file, is indexed at most once per call.<p>
     *
     * @param writer the index writer to use
     * @param threadManager the thread manager to use when extracting the document text
     * @param resourcesToUpdate the resources to update, may be <code>null</code> or empty
     *
     * @see org.opencms.search.I_CmsIndexer#updateResources(org.opencms.search.I_CmsIndexWriter, org.opencms.search.CmsIndexingThreadManager, java.util.List)
     */
    public void updateResources(
        I_CmsIndexWriter writer,
        CmsIndexingThreadManager threadManager,
        List<CmsPublishedResource> resourcesToUpdate) {

        if ((resourcesToUpdate == null) || resourcesToUpdate.isEmpty()) {
            // nothing to update
            return;
        }

        // root paths already handled, to avoid indexing the same resource several times
        Set<String> resourcesAlreadyUpdated = new HashSet<String>();

        // index all resources that are in the given list
        for (CmsPublishedResource res : resourcesToUpdate) {
            if (CmsResource.isTemporaryFileName(res.getRootPath())) {
                continue;
            }
            CmsResource resource = null;
            try {
                resource = m_cms.readResource(res.getRootPath(), CmsResourceFilter.IGNORE_EXPIRATION);
            } catch (CmsException e) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn(
                        Messages.get().getBundle().key(
                            Messages.LOG_UNABLE_TO_READ_RESOURCE_2,
                            res.getRootPath(),
                            m_index.getName()),
                        e);
                }
            }
            if (resource == null) {
                // resource could not be read, continue with the next one
                continue;
            }
            if (resourcesAlreadyUpdated.add(resource.getRootPath())) {
                // ensure resources are only indexed once per update
                updateResource(writer, threadManager, resource);
            }
            if (resource.isFolder()) {
                try {
                    CmsResource defaultFile = m_cms.readDefaultFile(
                        resource,
                        CmsResourceFilter.ONLY_VISIBLE_NO_DELETED);
                    // Set.add() also records the default file, so it is not indexed again
                    // if it shows up later in the list or as default file of another entry
                    if ((defaultFile != null) && resourcesAlreadyUpdated.add(defaultFile.getRootPath())) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(
                                Messages.get().getBundle().key(
                                    Messages.LOG_INDEXING_DEFAULT_FILE_FOR_FOLDER_3,
                                    defaultFile.getRootPath(),
                                    res.getRootPath(),
                                    m_index.getName()));
                        }
                        updateResource(writer, threadManager, defaultFile);
                    }
                } catch (CmsSecurityException e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn(
                            Messages.get().getBundle().key(
                                Messages.LOG_UNABLE_TO_READ_DEFAULT_FILE_FOR_FOLDER_2,
                                res.getRootPath(),
                                m_index.getName()),
                            e);
                    }
                }
            }
        }
    }

    /**
     * Adds a given published resource to the provided search index update data.<p>
     *
     * This method decides if the resource has to be included in the "update" or "delete" list.
     * A resource in state "deleted" goes to the delete list, a resource in state "new",
     * "changed" or "unchanged" goes to the update list, any other state is ignored.<p>
     *
     * @param pubRes the published resource to add
     * @param updateData the search index update data to add the resource to
     */
    protected void addResourceToUpdateData(CmsPublishedResource pubRes, CmsSearchIndexUpdateData updateData) {

        if (pubRes.getState().isDeleted()) {
            // deleted resource just needs to be removed
            updateData.addResourceToDelete(pubRes);
        } else if (pubRes.getState().isNew() || pubRes.getState().isChanged() || pubRes.getState().isUnchanged()) {
            updateData.addResourceToUpdate(pubRes);
        }
    }

    /**
     * Deletes a resource with the given index writer.<p>
     *
     * An {@link IOException} thrown by the index writer is logged as a warning and not propagated.<p>
     *
     * @param indexWriter the index writer to delete the resource with
     * @param resource the published resource to delete from the index
     */
    protected void deleteResource(I_CmsIndexWriter indexWriter, CmsPublishedResource resource) {

        try {
            if (LOG.isInfoEnabled()) {
                LOG.info(Messages.get().getBundle().key(Messages.LOG_DELETING_FROM_INDEX_1, resource.getRootPath()));
            }
            // remove the document(s) for this resource from the index
            indexWriter.deleteDocument(resource);
        } catch (IOException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(
                    Messages.get().getBundle().key(
                        Messages.LOG_IO_INDEX_DOCUMENT_DELETE_2,
                        resource.getRootPath(),
                        m_index.getName()),
                    e);
            }
        }
    }

    /**
     * Checks if the published resource is inside the time window set with release and expiration date.<p>
     *
     * The check is done by testing if the resource exists for the user context of this
     * indexer when read with {@link CmsResourceFilter#DEFAULT}.<p>
     *
     * @param resource the published resource to check
     * @return true if the published resource is inside the time window, otherwise false
     */
    protected boolean isResourceInTimeWindow(CmsPublishedResource resource) {

        return m_cms.existsResource(
            m_cms.getRequestContext().removeSiteRoot(resource.getRootPath()),
            CmsResourceFilter.DEFAULT);
    }

    /**
     * Updates (writes) a single resource in the index.<p>
     *
     * Folders and temporary files are ignored. Any failure while creating the indexing
     * thread is written to the report (if available) and logged as a warning.<p>
     *
     * @param writer the index writer to use
     * @param threadManager the thread manager to use when extracting the document text
     * @param resource the resource to update
     */
    protected void updateResource(
        I_CmsIndexWriter writer,
        CmsIndexingThreadManager threadManager,
        CmsResource resource) {

        if (resource.isFolder() || resource.isTemporaryFile()) {
            // don't ever index folders or temporary files
            return;
        }
        try {
            // create the index thread for the resource
            threadManager.createIndexingThread(this, writer, resource);
        } catch (Throwable e) {
            // Only runtime exceptions can appear here.
            // Deliberately best-effort: one failing resource must not abort the whole indexing run.
            if (m_report != null) {
                m_report.println(
                    Messages.get().container(Messages.RPT_SEARCH_INDEXING_FAILED_0),
                    I_CmsReport.FORMAT_WARNING);
            }
            if (LOG.isWarnEnabled()) {
                LOG.warn(
                    Messages.get().getBundle().key(
                        Messages.ERR_INDEX_RESOURCE_FAILED_2,
                        resource.getRootPath(),
                        m_index.getName()),
                    e);
            }
        }
    }

    /**
     * Updates a resource with the given index writer and the new document provided.<p>
     *
     * Any exception thrown by the index writer is logged as a warning and not propagated.<p>
     *
     * @param indexWriter the index writer to update the resource with
     * @param rootPath the root path of the resource to update
     * @param doc the new document for the resource
     */
    protected void updateResource(I_CmsIndexWriter indexWriter, String rootPath, I_CmsSearchDocument doc) {

        try {
            indexWriter.updateDocument(rootPath, doc);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(
                    Messages.get().getBundle().key(
                        Messages.LOG_IO_INDEX_DOCUMENT_UPDATE_2,
                        rootPath,
                        m_index.getName()),
                    e);
            }
        }
    }
}