001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.documents;
029
030import org.opencms.file.CmsFile;
031import org.opencms.file.CmsObject;
032import org.opencms.file.CmsResource;
033import org.opencms.file.types.I_CmsResourceType;
034import org.opencms.main.CmsException;
035import org.opencms.main.CmsLog;
036import org.opencms.main.OpenCms;
037import org.opencms.search.I_CmsSearchDocument;
038import org.opencms.search.I_CmsSearchDocument;
039import org.opencms.search.I_CmsSearchIndex;
040import org.opencms.search.extractors.I_CmsExtractionResult;
041
042import java.util.ArrayList;
043import java.util.Iterator;
044import java.util.List;
045
046import org.apache.commons.logging.Log;
047
048/**
049 * Base document factory class for a VFS <code>{@link org.opencms.file.CmsResource}</code>,
050 * just requires a specialized implementation of
051 * <code>{@link I_CmsDocumentFactory#extractContent(CmsObject, CmsResource, I_CmsSearchIndex)}</code>
052 * for text extraction from the binary document content.<p>
053 *
054 * @since 6.0.0
055 */
056public abstract class A_CmsVfsDocument implements I_CmsDocumentFactory {
057
058    /** The log object for this class. */
059    private static final Log LOG = CmsLog.getLog(A_CmsVfsDocument.class);
060
061    /** Name of the document type. */
062    protected String m_name;
063
064    /** The cache used for storing extracted documents. */
065    private CmsExtractionResultCache m_cache;
066
067    /**
068     * Creates a new instance of this lucene document factory.<p>
069     *
070     * @param name name of the documenttype
071     */
072    public A_CmsVfsDocument(String name) {
073
074        m_name = name;
075    }
076
077    /**
078     * Creates a document factory lookup key for the given resource type name / MIME type configuration.<p>
079     *
080     * If the given <code>mimeType</code> is <code>null</code>, this indicates that the key should
081     * match all VFS resource of the given resource type regardless of the MIME type.<p>
082     *
083     * @param type the resource type name to use
084     * @param mimeType the MIME type to use
085     *
086     * @return a document factory lookup key for the given resource id / MIME type configuration
087     */
088    public static String getDocumentKey(String type, String mimeType) {
089
090        StringBuffer result = new StringBuffer(16);
091        result.append(I_CmsSearchDocument.VFS_DOCUMENT_KEY_PREFIX);
092        result.append('_');
093        result.append(type);
094        if (mimeType != null) {
095            result.append(':');
096            result.append(mimeType);
097        }
098        return result.toString();
099    }
100
101    /**
102     * Generates a new lucene document instance from contents of the given resource for the provided index.<p>
103     *
104     * @see org.opencms.search.documents.I_CmsDocumentFactory#createDocument(CmsObject, CmsResource, I_CmsSearchIndex)
105     */
106    public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
107    throws CmsException {
108
109        // extract the content from the resource
110        I_CmsExtractionResult content = null;
111
112        if (index.isExtractingContent()) {
113            // do full text content extraction only if required
114
115            // check if caching is enabled for this document type
116            CmsExtractionResultCache cache = getCache();
117            String cacheName = null;
118            if ((cache != null) && (resource.getSiblingCount() > 1)) {
119                // hard drive based caching only makes sense for resources that have siblings,
120                // because the index will also store the content as a blob
121                cacheName = cache.getCacheName(
122                    resource,
123                    isLocaleDependend() ? index.getLocaleForResource(cms, resource, null) : null,
124                    getName());
125                content = cache.getCacheObject(cacheName);
126            }
127
128            if (content == null) {
129                // extraction result has not been found in the cache
130                // use the currently indexed content, if it is still up to date.
131                content = index.getContentIfUnchanged(resource);
132            }
133
134            if (content == null) {
135                // extraction result has not been attached to the resource
136                try {
137                    content = extractContent(cms, resource, index);
138                    if (LOG.isDebugEnabled()) {
139                        LOG.debug("Extracting content for '" + resource.getRootPath() + "' successful.");
140                    }
141                    if ((cache != null) && (resource.getSiblingCount() > 1)) {
142                        // save extracted content to the cache
143                        cache.saveCacheObject(cacheName, content);
144                    }
145                } catch (CmsIndexNoContentException e) {
146                    // there was no content found for the resource
147                    LOG.info(
148                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath())
149                            + " "
150                            + e.getMessage());
151                } catch (Throwable e) {
152                    // text extraction failed for document - continue indexing meta information only
153                    LOG.error(
154                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
155                        e);
156                }
157            }
158        }
159
160        // create the Lucene document according to the index field configuration
161        return index.getFieldConfiguration().createDocument(cms, resource, index, content);
162    }
163
164    /**
165     * @see org.opencms.search.documents.I_CmsDocumentFactory#getCache()
166     */
167    public CmsExtractionResultCache getCache() {
168
169        return m_cache;
170    }
171
172    /**
173     * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
174     */
175    public List<String> getDocumentKeys(List<String> resourceTypes, List<String> mimeTypes) throws CmsException {
176
177        List<String> keys = new ArrayList<String>();
178
179        if (resourceTypes.contains("*")) {
180            List<String> allTypes = new ArrayList<String>();
181            for (Iterator<I_CmsResourceType> i = OpenCms.getResourceManager().getResourceTypes().iterator(); i.hasNext();) {
182                I_CmsResourceType resourceType = i.next();
183                allTypes.add(resourceType.getTypeName());
184            }
185            resourceTypes = allTypes;
186        }
187
188        try {
189            for (Iterator<String> i = resourceTypes.iterator(); i.hasNext();) {
190
191                String typeName = i.next();
192                for (Iterator<String> j = mimeTypes.iterator(); j.hasNext();) {
193                    keys.add(getDocumentKey(typeName, j.next()));
194                }
195                if (mimeTypes.isEmpty()) {
196                    keys.add(getDocumentKey(typeName, null));
197                }
198            }
199        } catch (Exception exc) {
200            throw new CmsException(Messages.get().container(Messages.ERR_CREATE_DOC_KEY_0), exc);
201        }
202
203        return keys;
204    }
205
206    /**
207     * @see org.opencms.search.documents.I_CmsDocumentFactory#getName()
208     */
209    public String getName() {
210
211        return m_name;
212    }
213
214    /**
215     * @see org.opencms.search.documents.I_CmsDocumentFactory#setCache(org.opencms.search.documents.CmsExtractionResultCache)
216     */
217    public void setCache(CmsExtractionResultCache cache) {
218
219        m_cache = cache;
220    }
221
222    /**
223     * Logs content extraction for the specified resource and index.<p>
224     *
225     * @param resource the resource to log content extraction for
226     * @param index the search index to log content extraction for
227     */
228    protected void logContentExtraction(CmsResource resource, I_CmsSearchIndex index) {
229
230        if (LOG.isDebugEnabled()) {
231            LOG.debug(
232                Messages.get().getBundle().key(
233                    Messages.LOG_EXTRACT_CONTENT_2,
234                    resource.getRootPath(),
235                    index.getName()));
236        }
237    }
238
239    /**
240     * Upgrades the given resource to a {@link CmsFile} with content.<p>
241     *
242     * @param cms the current users OpenCms context
243     * @param resource the resource to upgrade
244     *
245     * @return the given resource upgraded to a {@link CmsFile} with content
246     *
247     * @throws CmsException if the resource could not be read
248     * @throws CmsIndexNoContentException if the resource has no content
249     */
250    protected CmsFile readFile(CmsObject cms, CmsResource resource) throws CmsException, CmsIndexNoContentException {
251
252        CmsFile file = cms.readFile(resource);
253        if (file.getLength() <= 0) {
254            throw new CmsIndexNoContentException(
255                Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
256        }
257        return file;
258    }
259}