001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search;
029
import org.opencms.db.CmsPublishedResource;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsProject;
import org.opencms.file.CmsResource;
import org.opencms.file.CmsResourceFilter;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.report.I_CmsReport;
import org.opencms.util.CmsUUID;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
046
047/**
048 * An indexer indexing {@link CmsResource} based content from the OpenCms VFS.<p>
049 *
050 * @since 6.0.0
051 */
052public class CmsVfsIndexer implements I_CmsIndexer {
053
054    /** The log object for this class. */
055    private static final Log LOG = CmsLog.getLog(CmsVfsIndexer.class);
056
057    // Note: The following member variables must all be "protected" (not "private") since
058    // in case the indexer is extended, the factory method "newInstance()" needs to set them.
059
060    /** The OpenCms user context to use when reading resources from the VFS during indexing. */
061    protected CmsObject m_cms;
062
063    /** The index. */
064    protected I_CmsSearchIndex m_index;
065
066    /** The report. */
067    protected I_CmsReport m_report;
068
069    /**
070     * @see org.opencms.search.I_CmsIndexer#deleteResources(org.opencms.search.I_CmsIndexWriter, java.util.List)
071     */
072    public void deleteResources(I_CmsIndexWriter indexWriter, List<CmsPublishedResource> resourcesToDelete) {
073
074        if ((resourcesToDelete == null) || resourcesToDelete.isEmpty()) {
075            // nothing to delete
076            return;
077        }
078
079        // contains all resources already deleted to avoid multiple deleting in case of siblings
080        List<CmsUUID> resourcesAlreadyDeleted = new ArrayList<CmsUUID>(resourcesToDelete.size());
081
082        Iterator<CmsPublishedResource> i = resourcesToDelete.iterator();
083        while (i.hasNext()) {
084            // iterate all resources in the given list of resources to delete
085            CmsPublishedResource res = i.next();
086            if (!resourcesAlreadyDeleted.contains(res.getStructureId())) {
087                // ensure siblings are only deleted once per update
088                resourcesAlreadyDeleted.add(res.getStructureId());
089                if (!res.isFolder() && !CmsResource.isTemporaryFileName(res.getRootPath())) {
090                    // now delete the resource from the index
091                    deleteResource(indexWriter, res);
092                }
093            }
094        }
095    }
096
097    /**
098     * Returns the OpenCms user context used by this indexer.<p>
099     *
100     * @return the OpenCms user context used by this indexer
101     */
102    public CmsObject getCms() {
103
104        return m_cms;
105    }
106
107    /**
108     * Returns the OpenCms search index updated by this indexer.<p>
109     *
110     * @return the OpenCms search index updated by this indexer
111     */
112    public I_CmsSearchIndex getIndex() {
113
114        return m_index;
115    }
116
117    /**
118     * Returns the report used by this indexer.<p>
119     *
120     * @return the report used by this indexer
121     */
122    public I_CmsReport getReport() {
123
124        return m_report;
125    }
126
127    /**
128     * @see org.opencms.search.I_CmsIndexer#getUpdateData(org.opencms.search.CmsSearchIndexSource, java.util.List)
129     */
130    public CmsSearchIndexUpdateData getUpdateData(
131        CmsSearchIndexSource source,
132        List<CmsPublishedResource> publishedResources) {
133
134        // create a new update collection from this indexer and the given index source
135        CmsSearchIndexUpdateData result = new CmsSearchIndexUpdateData(source, this);
136
137        Iterator<CmsPublishedResource> i = publishedResources.iterator();
138        while (i.hasNext()) {
139            // check all published resources if they match this indexer / source
140            CmsPublishedResource pubRes = i.next();
141            // VFS resources will always have a structure id
142            if (!pubRes.getStructureId().isNullUUID()) {
143                // use utility method from CmsProject to check if published resource is "inside" this index source
144                if (CmsProject.isInsideProject(source.getResourcesNames(), pubRes.getRootPath())) {
145                    // the resource is "inside" this index source
146                    addResourceToUpdateData(pubRes, result);
147                }
148            }
149        }
150        return result;
151    }
152
153    /**
154     * The default indexer is not able to resolve locale dependencies between documents.<p>
155     *
156     * @see org.opencms.search.I_CmsIndexer#isLocaleDependenciesEnable()
157     */
158    public boolean isLocaleDependenciesEnable() {
159
160        return false;
161    }
162
163    /**
164     * @see org.opencms.search.I_CmsIndexer#newInstance(org.opencms.file.CmsObject, org.opencms.report.I_CmsReport, org.opencms.search.I_CmsSearchIndex)
165     */
166    public I_CmsIndexer newInstance(CmsObject cms, I_CmsReport report, I_CmsSearchIndex index) {
167
168        CmsVfsIndexer indexer = null;
169        try {
170            indexer = getClass().newInstance();
171            indexer.m_cms = cms;
172            indexer.m_report = report;
173            indexer.m_index = index;
174        } catch (Exception e) {
175            LOG.error(
176                Messages.get().getBundle().key(
177                    Messages.ERR_INDEXSOURCE_INDEXER_CLASS_NAME_2,
178                    getClass().getName(),
179                    CmsVfsIndexer.class),
180                e);
181        }
182        return indexer;
183    }
184
185    /**
186     * @see org.opencms.search.I_CmsIndexer#rebuildIndex(org.opencms.search.I_CmsIndexWriter, org.opencms.search.CmsIndexingThreadManager, org.opencms.search.CmsSearchIndexSource)
187     */
188    public void rebuildIndex(
189        I_CmsIndexWriter writer,
190        CmsIndexingThreadManager threadManager,
191        CmsSearchIndexSource source)
192    throws CmsIndexException {
193
194        List<String> resourceNames = source.getResourcesNames();
195        Iterator<String> i = resourceNames.iterator();
196        while (i.hasNext()) {
197            // read the resources from all configured source folders
198            String resourceName = i.next();
199            List<CmsResource> resources = null;
200            try {
201                // read all resources (only files) below the given path
202                resources = m_cms.readResources(resourceName, CmsResourceFilter.IGNORE_EXPIRATION.addRequireFile());
203            } catch (CmsException e) {
204                if (m_report != null) {
205                    m_report.println(
206                        Messages.get().container(
207                            Messages.RPT_UNABLE_TO_READ_SOURCE_2,
208                            resourceName,
209                            e.getLocalizedMessage()),
210                        I_CmsReport.FORMAT_WARNING);
211                }
212                if (LOG.isWarnEnabled()) {
213                    LOG.warn(
214                        Messages.get().getBundle().key(
215                            Messages.LOG_UNABLE_TO_READ_SOURCE_2,
216                            resourceName,
217                            m_index.getName()),
218                        e);
219                }
220            }
221            if (resources != null) {
222                // iterate all resources found in the folder
223                Iterator<CmsResource> j = resources.iterator();
224                while (j.hasNext()) {
225                    // now update all the resources individually
226                    CmsResource resource = j.next();
227                    updateResource(writer, threadManager, resource);
228                }
229            }
230        }
231    }
232
233    /**
234     * @see org.opencms.search.I_CmsIndexer#updateResources(org.opencms.search.I_CmsIndexWriter, org.opencms.search.CmsIndexingThreadManager, java.util.List)
235     */
236    public void updateResources(
237        I_CmsIndexWriter writer,
238        CmsIndexingThreadManager threadManager,
239        List<CmsPublishedResource> resourcesToUpdate)
240    throws CmsIndexException {
241
242        if ((resourcesToUpdate == null) || resourcesToUpdate.isEmpty()) {
243            // nothing to update
244            return;
245        }
246
247        // contains all resources already updated to avoid multiple updates in case of siblings
248        List<String> resourcesAlreadyUpdated = new ArrayList<String>(resourcesToUpdate.size());
249
250        // index all resources that are in the given list
251        Iterator<CmsPublishedResource> i = resourcesToUpdate.iterator();
252        while (i.hasNext()) {
253            CmsPublishedResource res = i.next();
254            CmsResource resource = null;
255            if (!CmsResource.isTemporaryFileName(res.getRootPath())) {
256                try {
257                    resource = m_cms.readResource(res.getRootPath(), CmsResourceFilter.IGNORE_EXPIRATION);
258                } catch (CmsException e) {
259                    if (LOG.isWarnEnabled()) {
260                        LOG.warn(
261                            Messages.get().getBundle().key(
262                                Messages.LOG_UNABLE_TO_READ_RESOURCE_2,
263                                res.getRootPath(),
264                                m_index.getName()),
265                            e);
266                    }
267                }
268
269                if (resource != null) {
270                    if (!resourcesAlreadyUpdated.contains(resource.getRootPath())) {
271                        // ensure resources are only indexed once per update
272                        resourcesAlreadyUpdated.add(resource.getRootPath());
273                        updateResource(writer, threadManager, resource);
274                    }
275                }
276            }
277        }
278    }
279
280    /**
281     * Adds a given published resource to the provided search index update data.<p>
282     *
283     * This method decides if the resource has to be included in the "update" or "delete" list.<p>
284     *
285     * @param pubRes the published resource to add
286     * @param updateData the search index update data to add the resource to
287     */
288    protected void addResourceToUpdateData(CmsPublishedResource pubRes, CmsSearchIndexUpdateData updateData) {
289
290        if (pubRes.getState().isDeleted()) {
291            // deleted resource just needs to be removed
292            updateData.addResourceToDelete(pubRes);
293        } else if (pubRes.getState().isNew() || pubRes.getState().isChanged() || pubRes.getState().isUnchanged()) {
294            updateData.addResourceToUpdate(pubRes);
295        }
296    }
297
298    /**
299     * Deletes a resource with the given index writer.<p>
300     *
301     * @param indexWriter the index writer to resource the resource with
302     * @param resource the root path of the resource to delete
303     */
304    protected void deleteResource(I_CmsIndexWriter indexWriter, CmsPublishedResource resource) {
305
306        try {
307            if (LOG.isInfoEnabled()) {
308                LOG.info(Messages.get().getBundle().key(Messages.LOG_DELETING_FROM_INDEX_1, resource.getRootPath()));
309            }
310            // delete all documents with this term from the index
311            indexWriter.deleteDocument(resource);
312        } catch (IOException e) {
313            if (LOG.isWarnEnabled()) {
314                LOG.warn(
315                    Messages.get().getBundle().key(
316                        Messages.LOG_IO_INDEX_DOCUMENT_DELETE_2,
317                        resource.getRootPath(),
318                        m_index.getName()),
319                    e);
320            }
321        }
322    }
323
324    /**
325     * Checks if the published resource is inside the time window set with release and expiration date.<p>
326     *
327     * @param resource the published resource to check
328     * @return true if the published resource is inside the time window, otherwise false
329     */
330    protected boolean isResourceInTimeWindow(CmsPublishedResource resource) {
331
332        return m_cms.existsResource(
333            m_cms.getRequestContext().removeSiteRoot(resource.getRootPath()),
334            CmsResourceFilter.DEFAULT);
335    }
336
337    /**
338     * Updates (writes) a single resource in the index.<p>
339     *
340     * @param writer the index writer to use
341     * @param threadManager the thread manager to use when extracting the document text
342     * @param resource the resource to update
343     *
344     * @throws CmsIndexException if something goes wrong
345     */
346    protected void updateResource(I_CmsIndexWriter writer, CmsIndexingThreadManager threadManager, CmsResource resource)
347    throws CmsIndexException {
348
349        if (resource.isFolder() || resource.isTemporaryFile()) {
350            // don't ever index folders or temporary files
351            return;
352        }
353        try {
354            // create the index thread for the resource
355            threadManager.createIndexingThread(this, writer, resource);
356        } catch (Exception e) {
357
358            if (m_report != null) {
359                m_report.println(
360                    Messages.get().container(Messages.RPT_SEARCH_INDEXING_FAILED_0),
361                    I_CmsReport.FORMAT_WARNING);
362            }
363            if (LOG.isWarnEnabled()) {
364                LOG.warn(
365                    Messages.get().getBundle().key(
366                        Messages.ERR_INDEX_RESOURCE_FAILED_2,
367                        resource.getRootPath(),
368                        m_index.getName()),
369                    e);
370            }
371            throw new CmsIndexException(
372                Messages.get().container(
373                    Messages.ERR_INDEX_RESOURCE_FAILED_2,
374                    resource.getRootPath(),
375                    m_index.getName()));
376        }
377    }
378
379    /**
380     * Updates a resource with the given index writer and the new document provided.<p>
381     *
382     * @param indexWriter the index writer to update the resource with
383     * @param rootPath the root path of the resource to update
384     * @param doc the new document for the resource
385     */
386    protected void updateResource(I_CmsIndexWriter indexWriter, String rootPath, I_CmsSearchDocument doc) {
387
388        try {
389            indexWriter.updateDocument(rootPath, doc);
390        } catch (Exception e) {
391            if (LOG.isWarnEnabled()) {
392                LOG.warn(
393                    Messages.get().getBundle().key(
394                        Messages.LOG_IO_INDEX_DOCUMENT_UPDATE_2,
395                        rootPath,
396                        m_index.getName()),
397                    e);
398            }
399        }
400    }
401}