/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search.documents;

import org.opencms.file.CmsFile;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsResource;
import org.opencms.file.types.I_CmsResourceType;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.search.I_CmsSearchDocument;
import org.opencms.search.I_CmsSearchIndex;
import org.opencms.search.extractors.I_CmsExtractionResult;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;

/**
 * Base document factory class for a VFS <code>{@link org.opencms.file.CmsResource}</code>,
 * just requires a specialized implementation of
 * <code>{@link I_CmsDocumentFactory#extractContent(CmsObject, CmsResource, I_CmsSearchIndex)}</code>
 * for text extraction from the binary document content.<p>
 *
 * @since 6.0.0
 */
public abstract class A_CmsVfsDocument implements I_CmsDocumentFactory {

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(A_CmsVfsDocument.class);

    /** Name of the document type. */
    protected String m_name;

    /** The cache used for storing extracted documents. */
    private CmsExtractionResultCache m_cache;

    /**
     * Creates a new instance of this lucene document factory.<p>
     *
     * @param name name of the document type
     */
    public A_CmsVfsDocument(String name) {

        m_name = name;
    }

    /**
     * Creates a document factory lookup key for the given resource type name / MIME type configuration.<p>
     *
     * If the given <code>mimeType</code> is <code>null</code>, this indicates that the key should
     * match all VFS resources of the given resource type regardless of the MIME type.<p>
     *
     * The key has the form <code>PREFIX_type</code>, or <code>PREFIX_type:mimeType</code> if a MIME type
     * is given, where <code>PREFIX</code> is {@link I_CmsSearchDocument#VFS_DOCUMENT_KEY_PREFIX}.<p>
     *
     * @param type the resource type name to use
     * @param mimeType the MIME type to use, may be <code>null</code>
     *
     * @return a document factory lookup key for the given resource type name / MIME type configuration
     */
    public static String getDocumentKey(String type, String mimeType) {

        // the builder never leaves this method, so the unsynchronized StringBuilder is sufficient
        StringBuilder result = new StringBuilder(16);
        result.append(I_CmsSearchDocument.VFS_DOCUMENT_KEY_PREFIX);
        result.append('_');
        result.append(type);
        if (mimeType != null) {
            result.append(':');
            result.append(mimeType);
        }
        return result.toString();
    }

    /**
     * Generates a new lucene document instance from contents of the given resource for the provided index.<p>
     *
     * If the index extracts content, the extraction result is looked up in this order: the extraction
     * result cache (only for resources with siblings), the content already stored in the index if it is
     * unchanged, and finally a fresh extraction. If the extraction fails, the failure is logged and the
     * document is created without content.<p>
     *
     * @param cms the OpenCms user context used for the extraction
     * @param resource the resource to create the document for
     * @param index the index the document is created for
     *
     * @return the search document created by the field configuration of the index
     *
     * @throws CmsException declared by the implemented interface method
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#createDocument(CmsObject, CmsResource, I_CmsSearchIndex)
     */
    public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
    throws CmsException {

        // extract the content from the resource
        I_CmsExtractionResult content = null;

        if (index.isExtractingContent()) {
            // do full text content extraction only if required

            // hard drive based caching only makes sense for resources that have siblings,
            // because the index will also store the content as a blob
            CmsExtractionResultCache cache = getCache();
            boolean useCache = (cache != null) && (resource.getSiblingCount() > 1);
            String cacheName = null;
            if (useCache) {
                cacheName = cache.getCacheName(
                    resource,
                    isLocaleDependend() ? index.getLocaleForResource(cms, resource, null) : null,
                    getName());
                content = cache.getCacheObject(cacheName);
            }

            if (content == null) {
                // extraction result has not been found in the cache
                // use the currently indexed content, if it is still up to date.
                content = index.getContentIfUnchanged(resource);
            }

            if (content == null) {
                // neither the cache nor the index provided the content, a fresh extraction is required
                try {
                    content = extractContent(cms, resource, index);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Extracting content for '" + resource.getRootPath() + "' successful.");
                    }
                    if (useCache) {
                        // save extracted content to the cache
                        cache.saveCacheObject(cacheName, content);
                    }
                } catch (CmsIndexNoContentException e) {
                    // there was no content found for the resource
                    if (LOG.isInfoEnabled()) {
                        LOG.info(
                            Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath())
                                + " "
                                + e.getMessage());
                    }
                } catch (Throwable e) {
                    // text extraction failed for document - continue indexing meta information only
                    LOG.error(
                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
                        e);
                }
            }
        }

        // create the Lucene document according to the index field configuration
        return index.getFieldConfiguration().createDocument(cms, resource, index, content);
    }

    /**
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getCache()
     */
    public CmsExtractionResultCache getCache() {

        return m_cache;
    }

    /**
     * Creates one lookup key per resource type / MIME type combination, or one key per resource type
     * without a MIME type if the given MIME type list is empty.<p>
     *
     * If the resource type list contains <code>"*"</code>, the names of all resource types known to the
     * resource manager are used instead of the given list.<p>
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
     */
    public List<String> getDocumentKeys(List<String> resourceTypes, List<String> mimeTypes) throws CmsException {

        List<String> keys = new ArrayList<String>();

        if (resourceTypes.contains("*")) {
            // wildcard: replace the configured list with the names of all known resource types
            List<String> allTypes = new ArrayList<String>();
            for (I_CmsResourceType resourceType : OpenCms.getResourceManager().getResourceTypes()) {
                allTypes.add(resourceType.getTypeName());
            }
            resourceTypes = allTypes;
        }

        try {
            for (String typeName : resourceTypes) {
                for (String mimeType : mimeTypes) {
                    keys.add(getDocumentKey(typeName, mimeType));
                }
                if (mimeTypes.isEmpty()) {
                    // no MIME type configured: the key matches the resource type regardless of MIME type
                    keys.add(getDocumentKey(typeName, null));
                }
            }
        } catch (Exception exc) {
            throw new CmsException(Messages.get().container(Messages.ERR_CREATE_DOC_KEY_0), exc);
        }

        return keys;
    }

    /**
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getName()
     */
    public String getName() {

        return m_name;
    }

    /**
     * @see org.opencms.search.documents.I_CmsDocumentFactory#setCache(org.opencms.search.documents.CmsExtractionResultCache)
     */
    public void setCache(CmsExtractionResultCache cache) {

        m_cache = cache;
    }

    /**
     * Logs content extraction for the specified resource and index.<p>
     *
     * The message is only written if debug logging is enabled.<p>
     *
     * @param resource the resource to log content extraction for
     * @param index the search index to log content extraction for
     */
    protected void logContentExtraction(CmsResource resource, I_CmsSearchIndex index) {

        if (LOG.isDebugEnabled()) {
            LOG.debug(
                Messages.get().getBundle().key(
                    Messages.LOG_EXTRACT_CONTENT_2,
                    resource.getRootPath(),
                    index.getName()));
        }
    }

    /**
     * Upgrades the given resource to a {@link CmsFile} with content.<p>
     *
     * @param cms the current user's OpenCms context
     * @param resource the resource to upgrade
     *
     * @return the given resource upgraded to a {@link CmsFile} with content
     *
     * @throws CmsException if the resource could not be read
     * @throws CmsIndexNoContentException if the resource has no content
     */
    protected CmsFile readFile(CmsObject cms, CmsResource resource) throws CmsException, CmsIndexNoContentException {

        CmsFile file = cms.readFile(resource);
        if (file.getLength() <= 0) {
            // a file without content cannot be indexed, signal this to the caller
            throw new CmsIndexNoContentException(
                Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
        }
        return file;
    }
}