001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.solr.spellchecking;
029
030import org.opencms.file.CmsFile;
031import org.opencms.file.CmsObject;
032import org.opencms.file.CmsProject;
033import org.opencms.file.CmsRequestContext;
034import org.opencms.file.CmsResource;
035import org.opencms.file.CmsResourceFilter;
036import org.opencms.main.CmsException;
037import org.opencms.main.CmsLog;
038import org.opencms.main.OpenCms;
039import org.opencms.main.OpenCmsServlet;
040import org.opencms.util.CmsStringUtil;
041
042import java.io.BufferedReader;
043import java.io.ByteArrayInputStream;
044import java.io.File;
045import java.io.FileFilter;
046import java.io.IOException;
047import java.io.InputStream;
048import java.io.InputStreamReader;
049import java.util.ArrayList;
050import java.util.LinkedList;
051import java.util.List;
052import java.util.zip.ZipEntry;
053import java.util.zip.ZipInputStream;
054
055import org.apache.commons.logging.Log;
056import org.apache.solr.client.solrj.SolrClient;
057import org.apache.solr.client.solrj.SolrServerException;
058import org.apache.solr.common.SolrInputDocument;
059
060/**
061 * Helping class for manipulating the Solr spellchecker indices.
062 */
063public final class CmsSpellcheckDictionaryIndexer {
064
065    /** The log object for this class. */
066    private static final Log LOG = CmsLog.getLog(OpenCmsServlet.class);
067
068    /** The default directory that's holding the dictionary files. */
069    public static final String DEFAULT_DICTIONARY_DIRECTORY = "/system/workplace/editors/spellcheck";
070
071    /** A regex pattern that applies to the Solr spellcheck directories.
072     * Matching string example: "spellchecker_en" */
073    public static final String INDEXES_REGEX = "spellchecker_[a-z]{2}";
074
075    /** A regex pattern that applies to custom dictionaries.
076     * Matching string example: "custom_dict_en.txt" */
077    public static final String CUSTOM_DICTIONARY = "custom_dict_[a-z]{2}.txt";
078
079    /** A regex pattern that applies to the naming of the dictionary files.
080     * Matching string example: "dict_en.txt" */
081    public static final String DICTIONARY_NAME_REGEX = "dict_[a-z]{2}.txt";
082
083    /** A regex pattern that applies to the naming of zipped dictionary files.
084     * Matching string example: "dict_en.zip" */
085    public static final String ZIP_NAME_REGEX = "dict_[a-z]{2}.zip";
086
087    /** Maximum amount of entries while parsing the dictionary. This variable is needed
088     * in order to prevent OutOfMemoryExceptions while parsing large dictionaries. If you
089     * encounter such exceptions you can adjust its value to a smaller number. */
090    private static final int MAX_LIST_SIZE = 100000;
091
092    /**
093     * FileFilter implementation that returns only directories whose name matches
094     * the spellchecker indices regex.
095     */
096    private static final FileFilter SPELLCHECKING_DIRECTORY_NAME_FILTER = new FileFilter() {
097
098        public boolean accept(File f) {
099
100            return f.isDirectory() && f.getName().matches(INDEXES_REGEX);
101        }
102    };
103
104    /**
105     * Default constructor is private as each method is static.
106     */
107    private CmsSpellcheckDictionaryIndexer() {
108
109    }
110
111    /**
112     * Adds all dictionaries that are available in the default directory. <p>
113     *
114     * @param client The SolrClient instance object.
115     * @param cms the cms context
116     */
117    public static void parseAndAddDictionaries(SolrClient client, CmsObject cms) {
118
119        if ((null == client) || (null == cms)) {
120            return;
121        }
122
123        // Set the correct cms context
124        setCmsOfflineProject(cms);
125
126        try {
127            // Get all file resources in the default dictionary directory
128            final List<CmsResource> resources = cms.getResourcesInFolder(
129                DEFAULT_DICTIONARY_DIRECTORY,
130                CmsResourceFilter.DEFAULT_FILES);
131
132            for (final CmsResource resource : resources) {
133                final String resourceName = resource.getName();
134                // Check if the name of the file matches the dictionary naming scheme
135                String lang = null;
136                if (resourceName.matches(DICTIONARY_NAME_REGEX)) {
137                    // Extract the language code that consists of two letters (de, en, es, ...)
138                    lang = resourceName.substring(5, 7);
139                } else if (resourceName.matches(CUSTOM_DICTIONARY)) {
140                    lang = resourceName.substring(12, 14);
141                }
142
143                if (null != lang) {
144                    // Read the file
145                    final CmsFile file = cms.readFile(resource);
146
147                    // Parse file content and add it to the server
148                    final List<SolrInputDocument> documents = new ArrayList<SolrInputDocument>();
149
150                    readAndAddDocumentsFromStream(
151                        client,
152                        lang,
153                        new ByteArrayInputStream(file.getContents()),
154                        documents,
155                        true);
156
157                    // Add and commit the remaining documents to the server
158                    addDocuments(client, documents, true);
159                }
160            }
161
162        } catch (CmsException e) {
163            LOG.warn("Could not read from resource. ");
164        } catch (IOException e) {
165            LOG.warn("Could not successfully parse the dictionary. ");
166        } catch (SolrServerException e) {
167            LOG.warn("Exception while adding documents to Solr server. ");
168        }
169    }
170
171    /**
172     *
173     * @param client The SolrClient instance object.
174     * @param cms The OpenCms instance object.
175     */
176    public static void parseAndAddZippedDictionaries(SolrClient client, CmsObject cms) {
177
178        try {
179            final List<CmsResource> resources = cms.getResourcesInFolder(
180                DEFAULT_DICTIONARY_DIRECTORY,
181                CmsResourceFilter.DEFAULT_FILES);
182
183            // List holding all input documents, regardless of language
184            final List<SolrInputDocument> documents = new LinkedList<SolrInputDocument>();
185
186            for (CmsResource resource : resources) {
187                final String zipFileName = resource.getName();
188                if (zipFileName.matches(ZIP_NAME_REGEX)) {
189                    final CmsFile cmsFile = cms.readFile(resource);
190
191                    // Read zip file content
192                    final ZipInputStream zipStream = new ZipInputStream(
193                        new ByteArrayInputStream(cmsFile.getContents()));
194
195                    // Holds several entries (files) of the zipfile
196                    ZipEntry entry = zipStream.getNextEntry();
197
198                    // Iterate over each files in the zip file
199                    while (null != entry) {
200                        // Extract name to check if name matches the regex and to guess the
201                        // language from the filename
202                        final String name = entry.getName();
203
204                        if (name.matches(DICTIONARY_NAME_REGEX)) {
205
206                            // The (matching) filename reveals the language
207                            final String lang = name.substring(5, 7);
208
209                            // Parse and add documents
210                            readAndAddDocumentsFromStream(client, lang, zipStream, documents, false);
211
212                            // Get the next file in the zip
213                            entry = zipStream.getNextEntry();
214                        }
215
216                    }
217                }
218            }
219
220            // Add all documents
221            addDocuments(client, documents, true);
222        } catch (IOException e) {
223            LOG.warn("Failed while reading from " + DEFAULT_DICTIONARY_DIRECTORY + ". ");
224        } catch (CmsException e) {
225            LOG.warn("Failed reading resource " + DEFAULT_DICTIONARY_DIRECTORY + ". ");
226        } catch (SolrServerException e) {
227            LOG.warn("Failed adding documents to Solr server. ");
228        }
229    }
230
231    /**
232     * Checks whether a built of the indices is necessary.
233     * @param cms The appropriate CmsObject instance.
234     * @return true, if the spellcheck indices have to be rebuilt, otherwise false
235     */
236    public static boolean updatingIndexNecessesary(CmsObject cms) {
237
238        // Set request to the offline project.
239        setCmsOfflineProject(cms);
240
241        // Check whether the spellcheck index directories are empty.
242        // If they are, the index has to be built obviously.
243        if (isSolrSpellcheckIndexDirectoryEmpty()) {
244            return true;
245        }
246
247        // Compare the most recent date of a dictionary with the oldest timestamp
248        // that determines when an index has been built.
249        long dateMostRecentDictionary = getMostRecentDate(cms);
250        long dateOldestIndexWrite = getOldestIndexDate(cms);
251
252        return dateMostRecentDictionary > dateOldestIndexWrite;
253    }
254
255    /**
256     * Add a list of documents to the Solr client.<p>
257     *
258     * @param client The SolrClient instance object.
259     * @param documents The documents that should be added.
260     * @param commit boolean flag indicating whether a "commit" call should be made after adding the documents
261     *
262     * @throws IOException in case something goes wrong
263     * @throws SolrServerException in case something goes wrong
264     */
265    static void addDocuments(SolrClient client, List<SolrInputDocument> documents, boolean commit)
266    throws IOException, SolrServerException {
267
268        if ((null == client) || (null == documents)) {
269            return;
270        }
271
272        if (!documents.isEmpty()) {
273            client.add(documents);
274        }
275
276        if (commit) {
277            client.commit();
278        }
279    }
280
281    /**
282     * Deletes all documents from the Solr client.<p>
283     *
284     * @param client The SolrClient instance object.
285     *
286     * @throws IOException in case something goes wrong
287     * @throws SolrServerException in case something goes wrong
288     */
289    static void deleteAllFiles(SolrClient client) throws IOException, SolrServerException {
290
291        if (null == client) {
292            return;
293        }
294
295        client.deleteByQuery("*:*");
296        client.commit();
297    }
298
299    /**
300     * Deletes a single document from the Solr client.<p>
301     *
302     * @param client The SolrClient instance object.
303     * @param lang The affected language.
304     * @param word The word that should be removed.
305     *
306     * @throws IOException in case something goes wrong
307     * @throws SolrServerException in case something goes wrong
308     */
309    static void deleteDocument(SolrClient client, String lang, String word) throws IOException, SolrServerException {
310
311        if ((null == client)
312            || CmsStringUtil.isEmptyOrWhitespaceOnly(lang)
313            || CmsStringUtil.isEmptyOrWhitespaceOnly(word)) {
314            return;
315        }
316
317        // Make sure the parameter holding the word that should be deleted
318        // contains just a single word
319        if (word.trim().contains(" ")) {
320            final String query = String.format("entry_%s:%s", lang, word);
321            client.deleteByQuery(query);
322        }
323    }
324
325    /**
326     * Determines and returns the timestamp of the most recently modified spellchecker file.<p>
327     *
328     * @param cms the OpenCms instance.
329     * @return timestamp of type long.
330     */
331    private static long getMostRecentDate(CmsObject cms) {
332
333        long mostRecentDate = Long.MIN_VALUE;
334
335        try {
336            final List<CmsResource> resources = cms.getResourcesInFolder(
337                DEFAULT_DICTIONARY_DIRECTORY,
338                CmsResourceFilter.DEFAULT_FILES);
339
340            for (final CmsResource resource : resources) {
341                final String resourceName = resource.getName();
342                // Check whether the resource matches the desired patterns
343                if (resourceName.matches(DICTIONARY_NAME_REGEX)
344                    || resourceName.matches(ZIP_NAME_REGEX)
345                    || resourceName.matches(CUSTOM_DICTIONARY)) {
346                    if (resource.getDateLastModified() > mostRecentDate) {
347                        mostRecentDate = resource.getDateLastModified();
348                    }
349                }
350            }
351        } catch (CmsException e) {
352            LOG.error("Could not read spellchecker dictionaries. ");
353        }
354
355        return mostRecentDate;
356    }
357
358    /**
359     * Returns the timestamp of the index whose index-built operation lies the
360     * furthest back in the past.<p>
361     *
362     * @param cms the OpenCms instance.
363     * @return timestamp as type long.
364     */
365    private static long getOldestIndexDate(CmsObject cms) {
366
367        final File path = new File(getSolrSpellcheckRfsPath());
368        final File[] directories = path.listFiles(SPELLCHECKING_DIRECTORY_NAME_FILTER);
369
370        // Initialize with the greatest value a long type can hold
371        long oldestIndexDate = Long.MAX_VALUE;
372
373        for (final File dir : directories) {
374            long date = dir.lastModified();
375            if (date < oldestIndexDate) {
376                oldestIndexDate = date;
377            }
378        }
379
380        // If no file(s) have been found oldestIndexDate is still holding
381        // Long.MAX_VALUE. In that case return Long.MIN_VALUE to ensure
382        // that no indexing operation takes place.
383        if (Long.MAX_VALUE == oldestIndexDate) {
384            LOG.warn("It appears that no spellcheck indices have been found in " + getSolrSpellcheckRfsPath() + ". ");
385            return Long.MIN_VALUE;
386        }
387
388        return oldestIndexDate;
389    }
390
391    /**
392     * Returns the path in the RFS where the Solr spellcheck files reside.
393     * @return String representation of Solrs spellcheck RFS path.
394     */
395    private static String getSolrSpellcheckRfsPath() {
396
397        String sPath = OpenCms.getSystemInfo().getWebInfRfsPath();
398
399        if (!OpenCms.getSystemInfo().getWebInfRfsPath().endsWith(File.separator)) {
400            sPath += File.separator;
401        }
402
403        return sPath + "solr" + File.separator + "spellcheck" + File.separator + "data";
404    }
405
406    /**
407     * Returns whether the Solr spellchecking index directories are empty
408     * (not initiliazed) or not.
409     * @return true, if the directories contain no indexed data, otherwise false.
410     */
411    private static boolean isSolrSpellcheckIndexDirectoryEmpty() {
412
413        final File path = new File(getSolrSpellcheckRfsPath());
414        final File[] directories = path.listFiles(SPELLCHECKING_DIRECTORY_NAME_FILTER);
415
416        // Each directory that has been created by Solr but hasn't been indexed yet
417        // contains exactly two files. If there are more files, at least one index has
418        // already been built, so return false in that case.
419        if (directories != null) {
420            for (final File directory : directories) {
421                if (directory.list().length > 2) {
422                    return false;
423                }
424            }
425        }
426        return true;
427    }
428
429    /**
430     * Parses the dictionary from an InputStream.
431     *
432     * @param client The SolrClient instance object.
433     * @param lang The language of the dictionary.
434     * @param is The InputStream object.
435     * @param documents List to put the assembled SolrInputObjects into.
436     * @param closeStream boolean flag that determines whether to close the inputstream
437     * or not.
438     */
439    private static void readAndAddDocumentsFromStream(
440        final SolrClient client,
441        final String lang,
442        final InputStream is,
443        final List<SolrInputDocument> documents,
444        final boolean closeStream) {
445
446        final BufferedReader br = new BufferedReader(new InputStreamReader(is));
447
448        try {
449            String line = br.readLine();
450            while (null != line) {
451
452                final SolrInputDocument document = new SolrInputDocument();
453                // Each field is named after the schema "entry_xx" where xx denotes
454                // the two digit language code. See the file spellcheck/conf/schema.xml.
455                document.addField("entry_" + lang, line);
456                documents.add(document);
457
458                // Prevent OutOfMemoryExceptions ...
459                if (documents.size() >= MAX_LIST_SIZE) {
460                    addDocuments(client, documents, false);
461                    documents.clear();
462                }
463
464                line = br.readLine();
465            }
466        } catch (IOException e) {
467            LOG.error("Could not read spellcheck dictionary from input stream.");
468        } catch (SolrServerException e) {
469            LOG.error("Error while adding documents to Solr server. ");
470        } finally {
471            try {
472                if (closeStream) {
473                    br.close();
474                }
475            } catch (Exception e) {
476                // Nothing to do here anymore ....
477            }
478        }
479    }
480
481    /**
482     * Sets the appropriate OpenCms context.
483     * @param cms The OpenCms instance object.
484     */
485    private static void setCmsOfflineProject(CmsObject cms) {
486
487        if (null == cms) {
488            return;
489        }
490
491        final CmsRequestContext cmsContext = cms.getRequestContext();
492        final CmsProject cmsProject = cmsContext.getCurrentProject();
493
494        if (cmsProject.isOnlineProject()) {
495            CmsProject cmsOfflineProject;
496            try {
497                cmsOfflineProject = cms.readProject("Offline");
498                cmsContext.setCurrentProject(cmsOfflineProject);
499            } catch (CmsException e) {
500                LOG.warn("Could not set the current project to \"Offline\". ");
501            }
502        }
503    }
504}