/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search.documents;

import org.opencms.file.CmsFile;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsResource;
import org.opencms.file.types.CmsResourceTypeXmlContent;
import org.opencms.file.types.I_CmsResourceType;
import org.opencms.main.CmsException;
import org.opencms.main.OpenCms;
import org.opencms.search.CmsIndexException;
import org.opencms.search.I_CmsSearchDocument;
import org.opencms.search.I_CmsSearchIndex;
import org.opencms.search.extractors.CmsExtractionResult;
import org.opencms.search.extractors.I_CmsExtractionResult;
import org.opencms.util.CmsStringUtil;
import org.opencms.xml.A_CmsXmlDocument;
import org.opencms.xml.CmsXmlContentDefinition;
import org.opencms.xml.content.CmsXmlContentFactory;
import org.opencms.xml.content.I_CmsXmlContentHandler;
import org.opencms.xml.types.I_CmsXmlContentValue;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;

/**
 * Lucene document factory class to extract index data from an OpenCms VFS resource
 * of type <code>CmsResourceTypeXmlContent</code>.<p>
 *
 * The searchable XML nodes of the content are stored separately in the item map
 * which you can access using {@link CmsExtractionResult#getContentItems()}. Only the values of
 * the single locale selected by the search index for the resource are extracted. The XML elements
 * are accessible using their xpath. The xpath will have the form like for example
 * <code>Text[1]</code> or <code>Nested[1]/Text[1]</code>.<p>
 *
 * @since 6.0.0
 */
public class CmsDocumentXmlContent extends A_CmsVfsDocument {

    /**
     * Creates a new instance of this lucene document factory.<p>
     *
     * @param name name of the document type
     */
    public CmsDocumentXmlContent(String name) {

        super(name);
    }

    /**
     * Creates the search document for the given resource, unless the content handler of the
     * resource's XML content definition reports the content as "container page only".<p>
     *
     * @param cms the OpenCms user context used to read the content definition
     * @param resource the resource to create the document for
     * @param index the search index the document is created for
     *
     * @return the document created by the super class, or <code>null</code> if the content handler
     *         of the resource's content definition returns <code>true</code> for <code>isContainerPageOnly()</code>
     *
     * @throws CmsException if reading the content definition or creating the document fails
     *
     * @see org.opencms.search.documents.A_CmsVfsDocument#createDocument(org.opencms.file.CmsObject, org.opencms.file.CmsResource, org.opencms.search.I_CmsSearchIndex)
     */
    @Override
    public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
    throws CmsException {

        CmsXmlContentDefinition def = CmsXmlContentDefinition.getContentDefinitionForResource(cms, resource);
        if (def.getContentHandler().isContainerPageOnly()) {
            // no stand-alone document is produced for "container page only" contents
            return null;
        }
        return super.createDocument(cms, resource, index);
    }

    /**
     * Returns the raw text content of a given VFS resource of type <code>CmsResourceTypeXmlContent</code>.<p>
     *
     * The locale to extract is obtained from the index using
     * <code>getLocaleForResource</code> with the locales available in the XML content.
     * For this locale, every value the content handler considers searchable and whose plain text
     * is not empty or whitespace only is stored separately in the item map which you can access using
     * {@link CmsExtractionResult#getContentItems()}, in the order the element names are returned by the content.
     * The same texts, each followed by a line break, are concatenated to form the overall content.
     * The XML elements are accessible using their xpath. The xpath will have the form like for example
     * <code>Text[1]</code> or <code>Nested[1]/Text[1]</code>.<p>
     *
     * @param cms the OpenCms user context used to read and unmarshal the resource
     * @param resource the resource to extract the content from
     * @param index the search index the content is extracted for
     *
     * @return the extraction result holding the concatenated text and the individual content items
     *
     * @throws CmsException a {@link CmsIndexException} wrapping the original cause
     *         if anything goes wrong while reading, unmarshalling or extracting the content
     *
     * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, I_CmsSearchIndex)
     */
    @Override
    public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
    throws CmsException {

        logContentExtraction(resource, index);
        try {
            CmsFile file = readFile(cms, resource);
            A_CmsXmlDocument xmlContent = CmsXmlContentFactory.unmarshal(cms, file);
            I_CmsXmlContentHandler handler = xmlContent.getHandler();
            Locale locale = index.getLocaleForResource(cms, resource, xmlContent.getLocales());
            List<String> elements = xmlContent.getNames(locale);
            // method-local accumulator, so the unsynchronized builder is sufficient
            StringBuilder content = new StringBuilder();
            // a linked map keeps the items in the order of the element names
            LinkedHashMap<String, String> items = new LinkedHashMap<String, String>();
            for (String xpath : elements) {
                // xpath will have the form "Text[1]" or "Nested[1]/Text[1]"
                I_CmsXmlContentValue value = xmlContent.getValue(xpath, locale);
                if (handler.isSearchable(value)) {
                    // the content value is searchable
                    String extracted = value.getPlainText(cms);
                    if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(extracted)) {
                        items.put(xpath, extracted);
                        content.append(extracted);
                        content.append('\n');
                    }
                }
            }
            return new CmsExtractionResult(content.toString(), items);
        } catch (Exception e) {
            // report every failure uniformly as an index exception, keeping the original cause
            throw new CmsIndexException(
                Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
                e);
        }
    }

    /**
     * Returns the document keys for the given resource types and mime types.<p>
     *
     * If the given resource types contain the wildcard <code>"*"</code>, the list is replaced by the
     * type names of all configured resource types that are XML contents and either have a schema
     * configured or are implemented by a class other than {@link CmsResourceTypeXmlContent} itself.
     * The result is then calculated by the super class.<p>
     *
     * @param resourceTypes the resource type names, may contain the wildcard <code>"*"</code>
     * @param mimeTypes the mime types, passed on unchanged to the super class
     *
     * @return the document keys as calculated by the super class
     *
     * @throws CmsException if the super class fails to calculate the document keys
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
     */
    @Override
    public List<String> getDocumentKeys(List<String> resourceTypes, List<String> mimeTypes) throws CmsException {

        if (resourceTypes.contains("*")) {
            // we need to find all configured XML content types
            List<String> allTypes = new ArrayList<String>();
            for (I_CmsResourceType resourceType : OpenCms.getResourceManager().getResourceTypes()) {
                if ((resourceType instanceof CmsResourceTypeXmlContent)
                    // either we need a configured schema, or another class name (which must then contain an inline schema)
                    && (((CmsResourceTypeXmlContent)resourceType).getConfiguration().containsKey(
                        CmsResourceTypeXmlContent.CONFIGURATION_SCHEMA)
                        || !CmsResourceTypeXmlContent.class.equals(resourceType.getClass()))) {
                    // add the XML content resource type name
                    allTypes.add(resourceType.getTypeName());
                }
            }
            resourceTypes = allTypes;
        }

        return super.getDocumentKeys(resourceTypes, mimeTypes);
    }

    /**
     * Always returns <code>true</code> for this document factory.<p>
     *
     * @return <code>true</code>
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#isLocaleDependend()
     */
    @Override
    public boolean isLocaleDependend() {

        return true;
    }

    /**
     * Always returns <code>true</code> for this document factory.<p>
     *
     * @return <code>true</code>
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#isUsingCache()
     */
    @Override
    public boolean isUsingCache() {

        return true;
    }
}