/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search.solr.spellchecking;

import org.opencms.file.CmsFile;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsProject;
import org.opencms.file.CmsRequestContext;
import org.opencms.file.CmsResource;
import org.opencms.file.CmsResourceFilter;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.main.OpenCmsServlet;
import org.opencms.util.CmsStringUtil;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.logging.Log;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;

/**
 * Helper class for manipulating the Solr spellchecker indices.<p>
 *
 * All members are static; the class cannot be instantiated.
 */
public final class CmsSpellcheckDictionaryIndexer {

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsSpellcheckDictionaryIndexer.class);

    /** The default directory that's holding the dictionary files. */
    public static final String DEFAULT_DICTIONARY_DIRECTORY = "/system/workplace/editors/spellcheck";

    /** A regex pattern that applies to the Solr spellcheck directories.
     * Matching string example: "spellchecker_en" */
    public static final String INDEXES_REGEX = "spellchecker_[a-z]{2}";

    /** A regex pattern that applies to custom dictionaries.
     * Matching string example: "custom_dict_en.txt" */
    public static final String CUSTOM_DICTIONARY = "custom_dict_[a-z]{2}.txt";

    /** A regex pattern that applies to the naming of the dictionary files.
     * Matching string example: "dict_en.txt" */
    public static final String DICTIONARY_NAME_REGEX = "dict_[a-z]{2}.txt";

    /** A regex pattern that applies to the naming of zipped dictionary files.
     * Matching string example: "dict_en.zip" */
    public static final String ZIP_NAME_REGEX = "dict_[a-z]{2}.zip";

    /** Maximum amount of entries while parsing the dictionary. This variable is needed
     * in order to prevent OutOfMemoryExceptions while parsing large dictionaries. If you
     * encounter such exceptions you can adjust its value to a smaller number. */
    private static final int MAX_LIST_SIZE = 100000;

    /**
     * FileFilter implementation that returns only directories whose name matches
     * the spellchecker indices regex.
     */
    private static final FileFilter SPELLCHECKING_DIRECTORY_NAME_FILTER = new FileFilter() {

        @Override
        public boolean accept(File f) {

            return f.isDirectory() && f.getName().matches(INDEXES_REGEX);
        }
    };

    /**
     * Default constructor is private as each method is static.
     */
    private CmsSpellcheckDictionaryIndexer() {

    }

    /**
     * Adds all plain-text dictionaries that are available in the default directory.<p>
     *
     * Both regular ("dict_xx.txt") and custom ("custom_dict_xx.txt") dictionaries are
     * indexed. The cms context is switched to the "Offline" project if it currently
     * points to the online project. Failures are logged and not propagated.<p>
     *
     * @param client The SolrClient instance object; nothing happens if <code>null</code>.
     * @param cms the cms context; nothing happens if <code>null</code>.
     */
    public static void parseAndAddDictionaries(SolrClient client, CmsObject cms) {

        if ((null == client) || (null == cms)) {
            return;
        }

        // Set the correct cms context
        setCmsOfflineProject(cms);

        try {
            // Get all file resources in the default dictionary directory
            final List<CmsResource> resources = cms.getResourcesInFolder(
                DEFAULT_DICTIONARY_DIRECTORY,
                CmsResourceFilter.DEFAULT_FILES);

            for (final CmsResource resource : resources) {
                final String resourceName = resource.getName();

                // The two letter language code (de, en, es, ...) sits at a fixed
                // position that depends on which naming scheme matched.
                String lang = null;
                if (resourceName.matches(DICTIONARY_NAME_REGEX)) {
                    lang = resourceName.substring(5, 7);
                } else if (resourceName.matches(CUSTOM_DICTIONARY)) {
                    lang = resourceName.substring(12, 14);
                }

                if (null == lang) {
                    // Not a dictionary file
                    continue;
                }

                // Read the file
                final CmsFile file = cms.readFile(resource);

                // Parse file content and add it to the server
                final List<SolrInputDocument> documents = new ArrayList<SolrInputDocument>();
                readAndAddDocumentsFromStream(
                    client,
                    lang,
                    new ByteArrayInputStream(file.getContents()),
                    documents,
                    true);

                // Add and commit the remaining documents to the server
                addDocuments(client, documents, true);
            }
        } catch (CmsException e) {
            LOG.warn("Could not read from resource. ", e);
        } catch (IOException e) {
            LOG.warn("Could not successfully parse the dictionary. ", e);
        } catch (SolrServerException e) {
            LOG.warn("Exception while adding documents to Solr server. ", e);
        }
    }

    /**
     * Adds all zipped dictionaries ("dict_xx.zip") that are available in the default directory.<p>
     *
     * Every entry inside such a zip file whose name matches the dictionary naming
     * scheme ("dict_xx.txt") is parsed and indexed; all other entries are skipped.
     * Failures are logged and not propagated.<p>
     *
     * @param client The SolrClient instance object; nothing happens if <code>null</code>.
     * @param cms The OpenCms instance object; nothing happens if <code>null</code>.
     */
    public static void parseAndAddZippedDictionaries(SolrClient client, CmsObject cms) {

        if ((null == client) || (null == cms)) {
            return;
        }

        try {
            final List<CmsResource> resources = cms.getResourcesInFolder(
                DEFAULT_DICTIONARY_DIRECTORY,
                CmsResourceFilter.DEFAULT_FILES);

            // List holding all input documents, regardless of language
            final List<SolrInputDocument> documents = new LinkedList<SolrInputDocument>();

            for (final CmsResource resource : resources) {
                if (!resource.getName().matches(ZIP_NAME_REGEX)) {
                    continue;
                }

                final CmsFile cmsFile = cms.readFile(resource);

                // Read zip file content
                try (ZipInputStream zipStream = new ZipInputStream(
                    new ByteArrayInputStream(cmsFile.getContents()))) {

                    // Always advance to the next entry, whether or not the current one
                    // matched; otherwise a non-dictionary entry would loop forever.
                    for (ZipEntry entry = zipStream.getNextEntry(); null != entry; entry = zipStream.getNextEntry()) {
                        final String name = entry.getName();

                        if (name.matches(DICTIONARY_NAME_REGEX)) {
                            // The (matching) filename reveals the language
                            final String lang = name.substring(5, 7);

                            // Parse and add documents. The zip stream must stay open
                            // so that the remaining entries can still be read.
                            readAndAddDocumentsFromStream(client, lang, zipStream, documents, false);
                        }
                    }
                }
            }

            // Add all documents
            addDocuments(client, documents, true);
        } catch (IOException e) {
            LOG.warn("Failed while reading from " + DEFAULT_DICTIONARY_DIRECTORY + ". ", e);
        } catch (CmsException e) {
            LOG.warn("Failed reading resource " + DEFAULT_DICTIONARY_DIRECTORY + ". ", e);
        } catch (SolrServerException e) {
            LOG.warn("Failed adding documents to Solr server. ", e);
        }
    }

    /**
     * Checks whether a build of the indices is necessary.<p>
     *
     * A build is necessary if no index has been built yet, or if the most recently
     * modified dictionary is newer than the oldest index directory.<p>
     *
     * @param cms The appropriate CmsObject instance.
     * @return true, if the spellcheck indices have to be rebuilt, otherwise false
     */
    public static boolean updatingIndexNecessesary(CmsObject cms) {

        // Set request to the offline project.
        setCmsOfflineProject(cms);

        // Check whether the spellcheck index directories are empty.
        // If they are, the index has to be built obviously.
        if (isSolrSpellcheckIndexDirectoryEmpty()) {
            return true;
        }

        // Compare the most recent date of a dictionary with the oldest timestamp
        // that determines when an index has been built.
        final long dateMostRecentDictionary = getMostRecentDate(cms);
        final long dateOldestIndexWrite = getOldestIndexDate(cms);

        return dateMostRecentDictionary > dateOldestIndexWrite;
    }

    /**
     * Add a list of documents to the Solr client.<p>
     *
     * @param client The SolrClient instance object; nothing happens if <code>null</code>.
     * @param documents The documents that should be added; nothing happens if <code>null</code>.
     * @param commit boolean flag indicating whether a "commit" call should be made after adding the documents
     *
     * @throws IOException in case something goes wrong
     * @throws SolrServerException in case something goes wrong
     */
    static void addDocuments(SolrClient client, List<SolrInputDocument> documents, boolean commit)
    throws IOException, SolrServerException {

        if ((null == client) || (null == documents)) {
            return;
        }

        if (!documents.isEmpty()) {
            client.add(documents);
        }

        if (commit) {
            client.commit();
        }
    }

    /**
     * Deletes all documents from the Solr client.<p>
     *
     * @param client The SolrClient instance object; nothing happens if <code>null</code>.
     *
     * @throws IOException in case something goes wrong
     * @throws SolrServerException in case something goes wrong
     */
    static void deleteAllFiles(SolrClient client) throws IOException, SolrServerException {

        if (null == client) {
            return;
        }

        client.deleteByQuery("*:*");
        client.commit();
    }

    /**
     * Deletes a single document from the Solr client.<p>
     *
     * Nothing happens if any argument is missing or blank, or if the given word
     * consists of more than one space separated token. No commit is issued.<p>
     *
     * @param client The SolrClient instance object.
     * @param lang The affected language.
     * @param word The word that should be removed.
     *
     * @throws IOException in case something goes wrong
     * @throws SolrServerException in case something goes wrong
     */
    static void deleteDocument(SolrClient client, String lang, String word) throws IOException, SolrServerException {

        if ((null == client)
            || CmsStringUtil.isEmptyOrWhitespaceOnly(lang)
            || CmsStringUtil.isEmptyOrWhitespaceOnly(word)) {
            return;
        }

        // Make sure the parameter holding the word that should be deleted
        // contains just a single word
        final String singleWord = word.trim();
        if (!singleWord.contains(" ")) {
            final String query = String.format("entry_%s:%s", lang, singleWord);
            client.deleteByQuery(query);
        }
    }

    /**
     * Determines and returns the timestamp of the most recently modified spellchecker file.<p>
     *
     * @param cms the OpenCms instance.
     * @return timestamp of type long; <code>Long.MIN_VALUE</code> if no dictionary was found
     *         or the dictionaries could not be read.
     */
    private static long getMostRecentDate(CmsObject cms) {

        long mostRecentDate = Long.MIN_VALUE;

        try {
            final List<CmsResource> resources = cms.getResourcesInFolder(
                DEFAULT_DICTIONARY_DIRECTORY,
                CmsResourceFilter.DEFAULT_FILES);

            for (final CmsResource resource : resources) {
                final String resourceName = resource.getName();
                // Check whether the resource matches the desired patterns
                final boolean isDictionary = resourceName.matches(DICTIONARY_NAME_REGEX)
                    || resourceName.matches(ZIP_NAME_REGEX)
                    || resourceName.matches(CUSTOM_DICTIONARY);

                if (isDictionary) {
                    mostRecentDate = Math.max(mostRecentDate, resource.getDateLastModified());
                }
            }
        } catch (CmsException e) {
            LOG.error("Could not read spellchecker dictionaries. ", e);
        }

        return mostRecentDate;
    }

    /**
     * Returns the timestamp of the index whose index-built operation lies the
     * furthest back in the past.<p>
     *
     * @param cms the OpenCms instance (currently not used).
     * @return timestamp as type long; <code>Long.MIN_VALUE</code> if no index directory was found.
     */
    private static long getOldestIndexDate(CmsObject cms) {

        final String rfsPath = getSolrSpellcheckRfsPath();
        final File[] directories = new File(rfsPath).listFiles(SPELLCHECKING_DIRECTORY_NAME_FILTER);

        // Initialize with the greatest value a long type can hold
        long oldestIndexDate = Long.MAX_VALUE;

        // listFiles() yields null if the path does not denote a readable directory
        if (null != directories) {
            for (final File dir : directories) {
                oldestIndexDate = Math.min(oldestIndexDate, dir.lastModified());
            }
        }

        // If no directory has been found oldestIndexDate is still holding
        // Long.MAX_VALUE. In that case the smallest possible timestamp is
        // reported, so the caller's "dictionary newer than index" comparison
        // succeeds as soon as any dictionary exists.
        if (Long.MAX_VALUE == oldestIndexDate) {
            LOG.warn("It appears that no spellcheck indices have been found in " + rfsPath + ". ");
            return Long.MIN_VALUE;
        }

        return oldestIndexDate;
    }

    /**
     * Returns the path in the RFS where the Solr spellcheck files reside.
     * @return String representation of Solrs spellcheck RFS path.
     */
    private static String getSolrSpellcheckRfsPath() {

        String sPath = OpenCms.getSystemInfo().getWebInfRfsPath();

        if (!sPath.endsWith(File.separator)) {
            sPath += File.separator;
        }

        return sPath + "solr" + File.separator + "spellcheck" + File.separator + "data";
    }

    /**
     * Returns whether the Solr spellchecking index directories are empty
     * (not initialized) or not.
     * @return true, if the directories contain no indexed data, otherwise false.
     */
    private static boolean isSolrSpellcheckIndexDirectoryEmpty() {

        final File path = new File(getSolrSpellcheckRfsPath());
        final File[] directories = path.listFiles(SPELLCHECKING_DIRECTORY_NAME_FILTER);

        // Each directory that has been created by Solr but hasn't been indexed yet
        // contains exactly two files. If there are more files, at least one index has
        // already been built, so return false in that case.
        if (null != directories) {
            for (final File directory : directories) {
                // list() yields null if the directory cannot be read
                final String[] content = directory.list();
                if ((null != content) && (content.length > 2)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Parses the dictionary from an InputStream, one entry per line.<p>
     *
     * Whenever the document list reaches the maximum list size it is sent to the
     * Solr client (without commit) and cleared. Documents remaining in the list
     * when this method returns have to be added by the caller. Failures are logged
     * and not propagated.<p>
     *
     * @param client The SolrClient instance object.
     * @param lang The language of the dictionary.
     * @param is The InputStream object.
     * @param documents List to put the assembled SolrInputObjects into.
     * @param closeStream boolean flag that determines whether to close the inputstream
     * or not.
     */
    private static void readAndAddDocumentsFromStream(
        final SolrClient client,
        final String lang,
        final InputStream is,
        final List<SolrInputDocument> documents,
        final boolean closeStream) {

        // NOTE(review): the reader uses the platform default charset - confirm the
        // encoding of the dictionary files before pinning an explicit one.
        final BufferedReader br = new BufferedReader(new InputStreamReader(is));

        try {
            for (String line = br.readLine(); null != line; line = br.readLine()) {

                final SolrInputDocument document = new SolrInputDocument();
                // Each field is named after the schema "entry_xx" where xx denotes
                // the two digit language code. See the file spellcheck/conf/schema.xml.
                document.addField("entry_" + lang, line);
                documents.add(document);

                // Prevent OutOfMemoryExceptions ...
                if (documents.size() >= MAX_LIST_SIZE) {
                    addDocuments(client, documents, false);
                    documents.clear();
                }
            }
        } catch (IOException e) {
            LOG.error("Could not read spellcheck dictionary from input stream.", e);
        } catch (SolrServerException e) {
            LOG.error("Error while adding documents to Solr server. ", e);
        } finally {
            if (closeStream) {
                try {
                    br.close();
                } catch (IOException e) {
                    // Closing is best effort; there is nothing left to recover here.
                    LOG.debug("Could not close the dictionary input stream.", e);
                }
            }
        }
    }

    /**
     * Sets the appropriate OpenCms context.<p>
     *
     * If the current project of the given context is the online project, the
     * context is switched to the project named "Offline".<p>
     *
     * @param cms The OpenCms instance object; nothing happens if <code>null</code>.
     */
    private static void setCmsOfflineProject(CmsObject cms) {

        if (null == cms) {
            return;
        }

        final CmsRequestContext cmsContext = cms.getRequestContext();
        final CmsProject cmsProject = cmsContext.getCurrentProject();

        if (cmsProject.isOnlineProject()) {
            try {
                final CmsProject cmsOfflineProject = cms.readProject("Offline");
                cmsContext.setCurrentProject(cmsOfflineProject);
            } catch (CmsException e) {
                LOG.warn("Could not set the current project to \"Offline\". ", e);
            }
        }
    }
}