001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.solr.spellchecking;
029
030import org.opencms.file.CmsFile;
031import org.opencms.file.CmsObject;
032import org.opencms.file.CmsProject;
033import org.opencms.file.CmsRequestContext;
034import org.opencms.file.CmsResource;
035import org.opencms.file.CmsResourceFilter;
036import org.opencms.main.CmsException;
037import org.opencms.main.CmsLog;
038import org.opencms.main.OpenCms;
039import org.opencms.main.OpenCmsServlet;
040import org.opencms.util.CmsStringUtil;
041
042import java.io.BufferedReader;
043import java.io.ByteArrayInputStream;
044import java.io.File;
045import java.io.FileFilter;
046import java.io.IOException;
047import java.io.InputStream;
048import java.io.InputStreamReader;
049import java.util.ArrayList;
050import java.util.LinkedList;
051import java.util.List;
052import java.util.zip.ZipEntry;
053import java.util.zip.ZipInputStream;
054
055import org.apache.commons.logging.Log;
056import org.apache.solr.client.solrj.SolrClient;
057import org.apache.solr.client.solrj.SolrServerException;
058import org.apache.solr.common.SolrInputDocument;
059
/**
 * Helper class for manipulating the Solr spellchecker indices.
 */
063public final class CmsSpellcheckDictionaryIndexer {
064
065    /** The log object for this class. */
066    private static final Log LOG = CmsLog.getLog(OpenCmsServlet.class);
067
068    /** The default directory that's holding the dictionary files. */
069    public static final String DEFAULT_DICTIONARY_DIRECTORY = "/system/workplace/editors/spellcheck";
070
071    /** A regex pattern that applies to the Solr spellcheck directories.
072     * Matching string example: "spellchecker_en" */
073    public static final String INDEXES_REGEX = "spellchecker_[a-z]{2}";
074
075    /** A regex pattern that applies to custom dictionaries.
076     * Matching string example: "custom_dict_en.txt" */
077    public static final String CUSTOM_DICTIONARY = "custom_dict_[a-z]{2}.txt";
078
079    /** A regex pattern that applies to the naming of the dictionary files.
080     * Matching string example: "dict_en.txt" */
081    public static final String DICTIONARY_NAME_REGEX = "dict_[a-z]{2}.txt";
082
083    /** A regex pattern that applies to the naming of zipped dictionary files.
084     * Matching string example: "dict_en.zip" */
085    public static final String ZIP_NAME_REGEX = "dict_[a-z]{2}.zip";
086
087    /** Maximum amount of entries while parsing the dictionary. This variable is needed
088     * in order to prevent OutOfMemoryExceptions while parsing large dictionaries. If you
089     * encounter such exceptions you can adjust its value to a smaller number. */
090    private static final int MAX_LIST_SIZE = 100000;
091
092    /**
093     * FileFilter implementation that returns only directories whose name matches
094     * the spellchecker indices regex.
095     */
096    private static final FileFilter SPELLCHECKING_DIRECTORY_NAME_FILTER = new FileFilter() {
097
098        public boolean accept(File f) {
099
100            return f.isDirectory() && f.getName().matches(INDEXES_REGEX);
101        }
102    };
103
104    /**
105     * Default constructor is private as each method is static.
106     */
107    private CmsSpellcheckDictionaryIndexer() {
108
109    }
110
111    /**
112     * Adds all dictionaries that are available in the default directory. <p>
113     *
114     * @param client The SolrClient instance object.
115     * @param cms the cms context
116     */
117    public static void parseAndAddDictionaries(SolrClient client, CmsObject cms) {
118
119        if ((null == client) || (null == cms)) {
120            return;
121        }
122
123        // Set the correct cms context
124        setCmsOfflineProject(cms);
125
126        try {
127            // Get all file resources in the default dictionary directory
128            final List<CmsResource> resources = cms.getResourcesInFolder(
129                DEFAULT_DICTIONARY_DIRECTORY,
130                CmsResourceFilter.DEFAULT_FILES);
131
132            for (final CmsResource resource : resources) {
133                final String resourceName = resource.getName();
134                // Check if the name of the file matches the dictionary naming scheme
135                String lang = null;
136                if (resourceName.matches(DICTIONARY_NAME_REGEX)) {
137                    // Extract the language code that consists of two letters (de, en, es, ...)
138                    lang = resourceName.substring(5, 7);
139                } else if (resourceName.matches(CUSTOM_DICTIONARY)) {
140                    lang = resourceName.substring(12, 14);
141                }
142
143                if (null != lang) {
144                    // Read the file
145                    final CmsFile file = cms.readFile(resource);
146
147                    // Parse file content and add it to the server
148                    final List<SolrInputDocument> documents = new ArrayList<SolrInputDocument>();
149
150                    readAndAddDocumentsFromStream(
151                        client,
152                        lang,
153                        new ByteArrayInputStream(file.getContents()),
154                        documents,
155                        true);
156
157                    // Add and commit the remaining documents to the server
158                    addDocuments(client, documents, true);
159                }
160            }
161
162        } catch (CmsException e) {
163            LOG.warn("Could not read from resource. ");
164        } catch (IOException e) {
165            LOG.warn("Could not successfully parse the dictionary. ");
166        } catch (SolrServerException e) {
167            LOG.warn("Exception while adding documents to Solr server. ");
168        }
169    }
170
171    /**
172     *
173     * @param client The SolrClient instance object.
174     * @param cms The OpenCms instance object.
175     */
176    public static void parseAndAddZippedDictionaries(SolrClient client, CmsObject cms) {
177
178        try {
179            final List<CmsResource> resources = cms.getResourcesInFolder(
180                DEFAULT_DICTIONARY_DIRECTORY,
181                CmsResourceFilter.DEFAULT_FILES);
182
183            // List holding all input documents, regardless of language
184            final List<SolrInputDocument> documents = new LinkedList<SolrInputDocument>();
185
186            for (CmsResource resource : resources) {
187                final String zipFileName = resource.getName();
188                if (zipFileName.matches(ZIP_NAME_REGEX)) {
189                    final CmsFile cmsFile = cms.readFile(resource);
190
191                    // Read zip file content
192                    try (
193                    ZipInputStream zipStream = new ZipInputStream(new ByteArrayInputStream(cmsFile.getContents()))) {
194
195                        // Holds several entries (files) of the zipfile
196                        ZipEntry entry = zipStream.getNextEntry();
197
198                        // Iterate over each files in the zip file
199                        while (null != entry) {
200                            // Extract name to check if name matches the regex and to guess the
201                            // language from the filename
202                            final String name = entry.getName();
203
204                            if (name.matches(DICTIONARY_NAME_REGEX)) {
205
206                                // The (matching) filename reveals the language
207                                final String lang = name.substring(5, 7);
208
209                                // Parse and add documents
210                                readAndAddDocumentsFromStream(client, lang, zipStream, documents, false);
211
212                                // Get the next file in the zip
213                                entry = zipStream.getNextEntry();
214                            }
215
216                        }
217                    }
218                }
219            }
220
221            // Add all documents
222            addDocuments(client, documents, true);
223        } catch (IOException e) {
224            LOG.warn("Failed while reading from " + DEFAULT_DICTIONARY_DIRECTORY + ". ");
225        } catch (CmsException e) {
226            LOG.warn("Failed reading resource " + DEFAULT_DICTIONARY_DIRECTORY + ". ");
227        } catch (SolrServerException e) {
228            LOG.warn("Failed adding documents to Solr server. ");
229        }
230    }
231
232    /**
233     * Checks whether a built of the indices is necessary.
234     * @param cms The appropriate CmsObject instance.
235     * @return true, if the spellcheck indices have to be rebuilt, otherwise false
236     */
237    public static boolean updatingIndexNecessesary(CmsObject cms) {
238
239        // Set request to the offline project.
240        setCmsOfflineProject(cms);
241
242        // Check whether the spellcheck index directories are empty.
243        // If they are, the index has to be built obviously.
244        if (isSolrSpellcheckIndexDirectoryEmpty()) {
245            return true;
246        }
247
248        // Compare the most recent date of a dictionary with the oldest timestamp
249        // that determines when an index has been built.
250        long dateMostRecentDictionary = getMostRecentDate(cms);
251        long dateOldestIndexWrite = getOldestIndexDate(cms);
252
253        return dateMostRecentDictionary > dateOldestIndexWrite;
254    }
255
256    /**
257     * Add a list of documents to the Solr client.<p>
258     *
259     * @param client The SolrClient instance object.
260     * @param documents The documents that should be added.
261     * @param commit boolean flag indicating whether a "commit" call should be made after adding the documents
262     *
263     * @throws IOException in case something goes wrong
264     * @throws SolrServerException in case something goes wrong
265     */
266    static void addDocuments(SolrClient client, List<SolrInputDocument> documents, boolean commit)
267    throws IOException, SolrServerException {
268
269        if ((null == client) || (null == documents)) {
270            return;
271        }
272
273        if (!documents.isEmpty()) {
274            client.add(documents);
275        }
276
277        if (commit) {
278            client.commit();
279        }
280    }
281
282    /**
283     * Deletes all documents from the Solr client.<p>
284     *
285     * @param client The SolrClient instance object.
286     *
287     * @throws IOException in case something goes wrong
288     * @throws SolrServerException in case something goes wrong
289     */
290    static void deleteAllFiles(SolrClient client) throws IOException, SolrServerException {
291
292        if (null == client) {
293            return;
294        }
295
296        client.deleteByQuery("*:*");
297        client.commit();
298    }
299
300    /**
301     * Deletes a single document from the Solr client.<p>
302     *
303     * @param client The SolrClient instance object.
304     * @param lang The affected language.
305     * @param word The word that should be removed.
306     *
307     * @throws IOException in case something goes wrong
308     * @throws SolrServerException in case something goes wrong
309     */
310    static void deleteDocument(SolrClient client, String lang, String word) throws IOException, SolrServerException {
311
312        if ((null == client)
313            || CmsStringUtil.isEmptyOrWhitespaceOnly(lang)
314            || CmsStringUtil.isEmptyOrWhitespaceOnly(word)) {
315            return;
316        }
317
318        // Make sure the parameter holding the word that should be deleted
319        // contains just a single word
320        if (word.trim().contains(" ")) {
321            final String query = String.format("entry_%s:%s", lang, word);
322            client.deleteByQuery(query);
323        }
324    }
325
326    /**
327     * Determines and returns the timestamp of the most recently modified spellchecker file.<p>
328     *
329     * @param cms the OpenCms instance.
330     * @return timestamp of type long.
331     */
332    private static long getMostRecentDate(CmsObject cms) {
333
334        long mostRecentDate = Long.MIN_VALUE;
335
336        try {
337            final List<CmsResource> resources = cms.getResourcesInFolder(
338                DEFAULT_DICTIONARY_DIRECTORY,
339                CmsResourceFilter.DEFAULT_FILES);
340
341            for (final CmsResource resource : resources) {
342                final String resourceName = resource.getName();
343                // Check whether the resource matches the desired patterns
344                if (resourceName.matches(DICTIONARY_NAME_REGEX)
345                    || resourceName.matches(ZIP_NAME_REGEX)
346                    || resourceName.matches(CUSTOM_DICTIONARY)) {
347                    if (resource.getDateLastModified() > mostRecentDate) {
348                        mostRecentDate = resource.getDateLastModified();
349                    }
350                }
351            }
352        } catch (CmsException e) {
353            LOG.error("Could not read spellchecker dictionaries. ");
354        }
355
356        return mostRecentDate;
357    }
358
359    /**
360     * Returns the timestamp of the index whose index-built operation lies the
361     * furthest back in the past.<p>
362     *
363     * @param cms the OpenCms instance.
364     * @return timestamp as type long.
365     */
366    private static long getOldestIndexDate(CmsObject cms) {
367
368        final File path = new File(getSolrSpellcheckRfsPath());
369        final File[] directories = path.listFiles(SPELLCHECKING_DIRECTORY_NAME_FILTER);
370
371        // Initialize with the greatest value a long type can hold
372        long oldestIndexDate = Long.MAX_VALUE;
373
374        for (final File dir : directories) {
375            long date = dir.lastModified();
376            if (date < oldestIndexDate) {
377                oldestIndexDate = date;
378            }
379        }
380
381        // If no file(s) have been found oldestIndexDate is still holding
382        // Long.MAX_VALUE. In that case return Long.MIN_VALUE to ensure
383        // that no indexing operation takes place.
384        if (Long.MAX_VALUE == oldestIndexDate) {
385            LOG.warn("It appears that no spellcheck indices have been found in " + getSolrSpellcheckRfsPath() + ". ");
386            return Long.MIN_VALUE;
387        }
388
389        return oldestIndexDate;
390    }
391
392    /**
393     * Returns the path in the RFS where the Solr spellcheck files reside.
394     * @return String representation of Solrs spellcheck RFS path.
395     */
396    private static String getSolrSpellcheckRfsPath() {
397
398        String sPath = OpenCms.getSystemInfo().getWebInfRfsPath();
399
400        if (!OpenCms.getSystemInfo().getWebInfRfsPath().endsWith(File.separator)) {
401            sPath += File.separator;
402        }
403
404        return sPath + "solr" + File.separator + "spellcheck" + File.separator + "data";
405    }
406
407    /**
408     * Returns whether the Solr spellchecking index directories are empty
409     * (not initiliazed) or not.
410     * @return true, if the directories contain no indexed data, otherwise false.
411     */
412    private static boolean isSolrSpellcheckIndexDirectoryEmpty() {
413
414        final File path = new File(getSolrSpellcheckRfsPath());
415        final File[] directories = path.listFiles(SPELLCHECKING_DIRECTORY_NAME_FILTER);
416
417        // Each directory that has been created by Solr but hasn't been indexed yet
418        // contains exactly two files. If there are more files, at least one index has
419        // already been built, so return false in that case.
420        if (directories != null) {
421            for (final File directory : directories) {
422                if (directory.list().length > 2) {
423                    return false;
424                }
425            }
426        }
427        return true;
428    }
429
430    /**
431     * Parses the dictionary from an InputStream.
432     *
433     * @param client The SolrClient instance object.
434     * @param lang The language of the dictionary.
435     * @param is The InputStream object.
436     * @param documents List to put the assembled SolrInputObjects into.
437     * @param closeStream boolean flag that determines whether to close the inputstream
438     * or not.
439     */
440    private static void readAndAddDocumentsFromStream(
441        final SolrClient client,
442        final String lang,
443        final InputStream is,
444        final List<SolrInputDocument> documents,
445        final boolean closeStream) {
446
447        final BufferedReader br = new BufferedReader(new InputStreamReader(is));
448
449        try {
450            String line = br.readLine();
451            while (null != line) {
452
453                final SolrInputDocument document = new SolrInputDocument();
454                // Each field is named after the schema "entry_xx" where xx denotes
455                // the two digit language code. See the file spellcheck/conf/schema.xml.
456                document.addField("entry_" + lang, line);
457                documents.add(document);
458
459                // Prevent OutOfMemoryExceptions ...
460                if (documents.size() >= MAX_LIST_SIZE) {
461                    addDocuments(client, documents, false);
462                    documents.clear();
463                }
464
465                line = br.readLine();
466            }
467        } catch (IOException e) {
468            LOG.error("Could not read spellcheck dictionary from input stream.");
469        } catch (SolrServerException e) {
470            LOG.error("Error while adding documents to Solr server. ");
471        } finally {
472            try {
473                if (closeStream) {
474                    br.close();
475                }
476            } catch (Exception e) {
477                // Nothing to do here anymore ....
478            }
479        }
480    }
481
482    /**
483     * Sets the appropriate OpenCms context.
484     * @param cms The OpenCms instance object.
485     */
486    private static void setCmsOfflineProject(CmsObject cms) {
487
488        if (null == cms) {
489            return;
490        }
491
492        final CmsRequestContext cmsContext = cms.getRequestContext();
493        final CmsProject cmsProject = cmsContext.getCurrentProject();
494
495        if (cmsProject.isOnlineProject()) {
496            CmsProject cmsOfflineProject;
497            try {
498                cmsOfflineProject = cms.readProject("Offline");
499                cmsContext.setCurrentProject(cmsOfflineProject);
500            } catch (CmsException e) {
501                LOG.warn("Could not set the current project to \"Offline\". ");
502            }
503        }
504    }
505}