001/* 002 * This library is part of OpenCms - 003 * the Open Source Content Management System 004 * 005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com) 006 * 007 * This library is free software; you can redistribute it and/or 008 * modify it under the terms of the GNU Lesser General Public 009 * License as published by the Free Software Foundation; either 010 * version 2.1 of the License, or (at your option) any later version. 011 * 012 * This library is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * Lesser General Public License for more details. 016 * 017 * For further information about Alkacon Software, please see the 018 * company website: http://www.alkacon.com 019 * 020 * For further information about OpenCms, please see the 021 * project website: http://www.opencms.org 022 * 023 * You should have received a copy of the GNU Lesser General Public 024 * License along with this library; if not, write to the Free Software 025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 026 */ 027 028package org.opencms.search.solr.spellchecking; 029 030import org.opencms.file.CmsFile; 031import org.opencms.file.CmsObject; 032import org.opencms.file.CmsProject; 033import org.opencms.file.CmsRequestContext; 034import org.opencms.file.CmsResource; 035import org.opencms.file.CmsResourceFilter; 036import org.opencms.main.CmsException; 037import org.opencms.main.CmsLog; 038import org.opencms.main.OpenCms; 039import org.opencms.main.OpenCmsServlet; 040import org.opencms.util.CmsStringUtil; 041 042import java.io.BufferedReader; 043import java.io.ByteArrayInputStream; 044import java.io.File; 045import java.io.FileFilter; 046import java.io.IOException; 047import java.io.InputStream; 048import java.io.InputStreamReader; 049import java.util.ArrayList; 050import java.util.LinkedList; 051import java.util.List; 052import java.util.zip.ZipEntry; 053import java.util.zip.ZipInputStream; 054 055import org.apache.commons.logging.Log; 056import org.apache.solr.client.solrj.SolrClient; 057import org.apache.solr.client.solrj.SolrServerException; 058import org.apache.solr.common.SolrInputDocument; 059 060/** 061 * Helping class for manipulating the Solr spellchecker indices. 062 */ 063public final class CmsSpellcheckDictionaryIndexer { 064 065 /** The log object for this class. */ 066 private static final Log LOG = CmsLog.getLog(OpenCmsServlet.class); 067 068 /** The default directory that's holding the dictionary files. */ 069 public static final String DEFAULT_DICTIONARY_DIRECTORY = "/system/workplace/editors/spellcheck"; 070 071 /** A regex pattern that applies to the Solr spellcheck directories. 072 * Matching string example: "spellchecker_en" */ 073 public static final String INDEXES_REGEX = "spellchecker_[a-z]{2}"; 074 075 /** A regex pattern that applies to custom dictionaries. 076 * Matching string example: "custom_dict_en.txt" */ 077 public static final String CUSTOM_DICTIONARY = "custom_dict_[a-z]{2}.txt"; 078 079 /** A regex pattern that applies to the naming of the dictionary files. 080 * Matching string example: "dict_en.txt" */ 081 public static final String DICTIONARY_NAME_REGEX = "dict_[a-z]{2}.txt"; 082 083 /** A regex pattern that applies to the naming of zipped dictionary files. 084 * Matching string example: "dict_en.zip" */ 085 public static final String ZIP_NAME_REGEX = "dict_[a-z]{2}.zip"; 086 087 /** Maximum amount of entries while parsing the dictionary. This variable is needed 088 * in order to prevent OutOfMemoryExceptions while parsing large dictionaries. If you 089 * encounter such exceptions you can adjust its value to a smaller number. */ 090 private static final int MAX_LIST_SIZE = 100000; 091 092 /** 093 * FileFilter implementation that returns only directories whose name matches 094 * the spellchecker indices regex. 095 */ 096 private static final FileFilter SPELLCHECKING_DIRECTORY_NAME_FILTER = new FileFilter() { 097 098 public boolean accept(File f) { 099 100 return f.isDirectory() && f.getName().matches(INDEXES_REGEX); 101 } 102 }; 103 104 /** 105 * Default constructor is private as each method is static. 106 */ 107 private CmsSpellcheckDictionaryIndexer() { 108 109 } 110 111 /** 112 * Adds all dictionaries that are available in the default directory. <p> 113 * 114 * @param client The SolrClient instance object. 115 * @param cms the cms context 116 */ 117 public static void parseAndAddDictionaries(SolrClient client, CmsObject cms) { 118 119 if ((null == client) || (null == cms)) { 120 return; 121 } 122 123 // Set the correct cms context 124 setCmsOfflineProject(cms); 125 126 try { 127 // Get all file resources in the default dictionary directory 128 final List<CmsResource> resources = cms.getResourcesInFolder( 129 DEFAULT_DICTIONARY_DIRECTORY, 130 CmsResourceFilter.DEFAULT_FILES); 131 132 for (final CmsResource resource : resources) { 133 final String resourceName = resource.getName(); 134 // Check if the name of the file matches the dictionary naming scheme 135 String lang = null; 136 if (resourceName.matches(DICTIONARY_NAME_REGEX)) { 137 // Extract the language code that consists of two letters (de, en, es, ...) 138 lang = resourceName.substring(5, 7); 139 } else if (resourceName.matches(CUSTOM_DICTIONARY)) { 140 lang = resourceName.substring(12, 14); 141 } 142 143 if (null != lang) { 144 // Read the file 145 final CmsFile file = cms.readFile(resource); 146 147 // Parse file content and add it to the server 148 final List<SolrInputDocument> documents = new ArrayList<SolrInputDocument>(); 149 150 readAndAddDocumentsFromStream( 151 client, 152 lang, 153 new ByteArrayInputStream(file.getContents()), 154 documents, 155 true); 156 157 // Add and commit the remaining documents to the server 158 addDocuments(client, documents, true); 159 } 160 } 161 162 } catch (CmsException e) { 163 LOG.warn("Could not read from resource. "); 164 } catch (IOException e) { 165 LOG.warn("Could not successfully parse the dictionary. "); 166 } catch (SolrServerException e) { 167 LOG.warn("Exception while adding documents to Solr server. "); 168 } 169 } 170 171 /** 172 * 173 * @param client The SolrClient instance object. 174 * @param cms The OpenCms instance object. 175 */ 176 public static void parseAndAddZippedDictionaries(SolrClient client, CmsObject cms) { 177 178 try { 179 final List<CmsResource> resources = cms.getResourcesInFolder( 180 DEFAULT_DICTIONARY_DIRECTORY, 181 CmsResourceFilter.DEFAULT_FILES); 182 183 // List holding all input documents, regardless of language 184 final List<SolrInputDocument> documents = new LinkedList<SolrInputDocument>(); 185 186 for (CmsResource resource : resources) { 187 final String zipFileName = resource.getName(); 188 if (zipFileName.matches(ZIP_NAME_REGEX)) { 189 final CmsFile cmsFile = cms.readFile(resource); 190 191 // Read zip file content 192 final ZipInputStream zipStream = new ZipInputStream( 193 new ByteArrayInputStream(cmsFile.getContents())); 194 195 // Holds several entries (files) of the zipfile 196 ZipEntry entry = zipStream.getNextEntry(); 197 198 // Iterate over each files in the zip file 199 while (null != entry) { 200 // Extract name to check if name matches the regex and to guess the 201 // language from the filename 202 final String name = entry.getName(); 203 204 if (name.matches(DICTIONARY_NAME_REGEX)) { 205 206 // The (matching) filename reveals the language 207 final String lang = name.substring(5, 7); 208 209 // Parse and add documents 210 readAndAddDocumentsFromStream(client, lang, zipStream, documents, false); 211 212 // Get the next file in the zip 213 entry = zipStream.getNextEntry(); 214 } 215 216 } 217 } 218 } 219 220 // Add all documents 221 addDocuments(client, documents, true); 222 } catch (IOException e) { 223 LOG.warn("Failed while reading from " + DEFAULT_DICTIONARY_DIRECTORY + ". "); 224 } catch (CmsException e) { 225 LOG.warn("Failed reading resource " + DEFAULT_DICTIONARY_DIRECTORY + ". "); 226 } catch (SolrServerException e) { 227 LOG.warn("Failed adding documents to Solr server. "); 228 } 229 } 230 231 /** 232 * Checks whether a built of the indices is necessary. 233 * @param cms The appropriate CmsObject instance. 234 * @return true, if the spellcheck indices have to be rebuilt, otherwise false 235 */ 236 public static boolean updatingIndexNecessesary(CmsObject cms) { 237 238 // Set request to the offline project. 239 setCmsOfflineProject(cms); 240 241 // Check whether the spellcheck index directories are empty. 242 // If they are, the index has to be built obviously. 243 if (isSolrSpellcheckIndexDirectoryEmpty()) { 244 return true; 245 } 246 247 // Compare the most recent date of a dictionary with the oldest timestamp 248 // that determines when an index has been built. 249 long dateMostRecentDictionary = getMostRecentDate(cms); 250 long dateOldestIndexWrite = getOldestIndexDate(cms); 251 252 return dateMostRecentDictionary > dateOldestIndexWrite; 253 } 254 255 /** 256 * Add a list of documents to the Solr client.<p> 257 * 258 * @param client The SolrClient instance object. 259 * @param documents The documents that should be added. 260 * @param commit boolean flag indicating whether a "commit" call should be made after adding the documents 261 * 262 * @throws IOException in case something goes wrong 263 * @throws SolrServerException in case something goes wrong 264 */ 265 static void addDocuments(SolrClient client, List<SolrInputDocument> documents, boolean commit) 266 throws IOException, SolrServerException { 267 268 if ((null == client) || (null == documents)) { 269 return; 270 } 271 272 if (!documents.isEmpty()) { 273 client.add(documents); 274 } 275 276 if (commit) { 277 client.commit(); 278 } 279 } 280 281 /** 282 * Deletes all documents from the Solr client.<p> 283 * 284 * @param client The SolrClient instance object. 285 * 286 * @throws IOException in case something goes wrong 287 * @throws SolrServerException in case something goes wrong 288 */ 289 static void deleteAllFiles(SolrClient client) throws IOException, SolrServerException { 290 291 if (null == client) { 292 return; 293 } 294 295 client.deleteByQuery("*:*"); 296 client.commit(); 297 } 298 299 /** 300 * Deletes a single document from the Solr client.<p> 301 * 302 * @param client The SolrClient instance object. 303 * @param lang The affected language. 304 * @param word The word that should be removed. 305 * 306 * @throws IOException in case something goes wrong 307 * @throws SolrServerException in case something goes wrong 308 */ 309 static void deleteDocument(SolrClient client, String lang, String word) throws IOException, SolrServerException { 310 311 if ((null == client) 312 || CmsStringUtil.isEmptyOrWhitespaceOnly(lang) 313 || CmsStringUtil.isEmptyOrWhitespaceOnly(word)) { 314 return; 315 } 316 317 // Make sure the parameter holding the word that should be deleted 318 // contains just a single word 319 if (word.trim().contains(" ")) { 320 final String query = String.format("entry_%s:%s", lang, word); 321 client.deleteByQuery(query); 322 } 323 } 324 325 /** 326 * Determines and returns the timestamp of the most recently modified spellchecker file.<p> 327 * 328 * @param cms the OpenCms instance. 329 * @return timestamp of type long. 330 */ 331 private static long getMostRecentDate(CmsObject cms) { 332 333 long mostRecentDate = Long.MIN_VALUE; 334 335 try { 336 final List<CmsResource> resources = cms.getResourcesInFolder( 337 DEFAULT_DICTIONARY_DIRECTORY, 338 CmsResourceFilter.DEFAULT_FILES); 339 340 for (final CmsResource resource : resources) { 341 final String resourceName = resource.getName(); 342 // Check whether the resource matches the desired patterns 343 if (resourceName.matches(DICTIONARY_NAME_REGEX) 344 || resourceName.matches(ZIP_NAME_REGEX) 345 || resourceName.matches(CUSTOM_DICTIONARY)) { 346 if (resource.getDateLastModified() > mostRecentDate) { 347 mostRecentDate = resource.getDateLastModified(); 348 } 349 } 350 } 351 } catch (CmsException e) { 352 LOG.error("Could not read spellchecker dictionaries. "); 353 } 354 355 return mostRecentDate; 356 } 357 358 /** 359 * Returns the timestamp of the index whose index-built operation lies the 360 * furthest back in the past.<p> 361 * 362 * @param cms the OpenCms instance. 363 * @return timestamp as type long. 364 */ 365 private static long getOldestIndexDate(CmsObject cms) { 366 367 final File path = new File(getSolrSpellcheckRfsPath()); 368 final File[] directories = path.listFiles(SPELLCHECKING_DIRECTORY_NAME_FILTER); 369 370 // Initialize with the greatest value a long type can hold 371 long oldestIndexDate = Long.MAX_VALUE; 372 373 for (final File dir : directories) { 374 long date = dir.lastModified(); 375 if (date < oldestIndexDate) { 376 oldestIndexDate = date; 377 } 378 } 379 380 // If no file(s) have been found oldestIndexDate is still holding 381 // Long.MAX_VALUE. In that case return Long.MIN_VALUE to ensure 382 // that no indexing operation takes place. 383 if (Long.MAX_VALUE == oldestIndexDate) { 384 LOG.warn("It appears that no spellcheck indices have been found in " + getSolrSpellcheckRfsPath() + ". "); 385 return Long.MIN_VALUE; 386 } 387 388 return oldestIndexDate; 389 } 390 391 /** 392 * Returns the path in the RFS where the Solr spellcheck files reside. 393 * @return String representation of Solrs spellcheck RFS path. 394 */ 395 private static String getSolrSpellcheckRfsPath() { 396 397 String sPath = OpenCms.getSystemInfo().getWebInfRfsPath(); 398 399 if (!OpenCms.getSystemInfo().getWebInfRfsPath().endsWith(File.separator)) { 400 sPath += File.separator; 401 } 402 403 return sPath + "solr" + File.separator + "spellcheck" + File.separator + "data"; 404 } 405 406 /** 407 * Returns whether the Solr spellchecking index directories are empty 408 * (not initiliazed) or not. 409 * @return true, if the directories contain no indexed data, otherwise false. 410 */ 411 private static boolean isSolrSpellcheckIndexDirectoryEmpty() { 412 413 final File path = new File(getSolrSpellcheckRfsPath()); 414 final File[] directories = path.listFiles(SPELLCHECKING_DIRECTORY_NAME_FILTER); 415 416 // Each directory that has been created by Solr but hasn't been indexed yet 417 // contains exactly two files. If there are more files, at least one index has 418 // already been built, so return false in that case. 419 if (directories != null) { 420 for (final File directory : directories) { 421 if (directory.list().length > 2) { 422 return false; 423 } 424 } 425 } 426 return true; 427 } 428 429 /** 430 * Parses the dictionary from an InputStream. 431 * 432 * @param client The SolrClient instance object. 433 * @param lang The language of the dictionary. 434 * @param is The InputStream object. 435 * @param documents List to put the assembled SolrInputObjects into. 436 * @param closeStream boolean flag that determines whether to close the inputstream 437 * or not. 438 */ 439 private static void readAndAddDocumentsFromStream( 440 final SolrClient client, 441 final String lang, 442 final InputStream is, 443 final List<SolrInputDocument> documents, 444 final boolean closeStream) { 445 446 final BufferedReader br = new BufferedReader(new InputStreamReader(is)); 447 448 try { 449 String line = br.readLine(); 450 while (null != line) { 451 452 final SolrInputDocument document = new SolrInputDocument(); 453 // Each field is named after the schema "entry_xx" where xx denotes 454 // the two digit language code. See the file spellcheck/conf/schema.xml. 455 document.addField("entry_" + lang, line); 456 documents.add(document); 457 458 // Prevent OutOfMemoryExceptions ... 459 if (documents.size() >= MAX_LIST_SIZE) { 460 addDocuments(client, documents, false); 461 documents.clear(); 462 } 463 464 line = br.readLine(); 465 } 466 } catch (IOException e) { 467 LOG.error("Could not read spellcheck dictionary from input stream."); 468 } catch (SolrServerException e) { 469 LOG.error("Error while adding documents to Solr server. "); 470 } finally { 471 try { 472 if (closeStream) { 473 br.close(); 474 } 475 } catch (Exception e) { 476 // Nothing to do here anymore .... 477 } 478 } 479 } 480 481 /** 482 * Sets the appropriate OpenCms context. 483 * @param cms The OpenCms instance object. 484 */ 485 private static void setCmsOfflineProject(CmsObject cms) { 486 487 if (null == cms) { 488 return; 489 } 490 491 final CmsRequestContext cmsContext = cms.getRequestContext(); 492 final CmsProject cmsProject = cmsContext.getCurrentProject(); 493 494 if (cmsProject.isOnlineProject()) { 495 CmsProject cmsOfflineProject; 496 try { 497 cmsOfflineProject = cms.readProject("Offline"); 498 cmsContext.setCurrentProject(cmsOfflineProject); 499 } catch (CmsException e) { 500 LOG.warn("Could not set the current project to \"Offline\". "); 501 } 502 } 503 } 504}