001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.documents;
029
030import org.opencms.ade.configuration.CmsADEConfigData;
031import org.opencms.file.CmsFile;
032import org.opencms.file.CmsObject;
033import org.opencms.file.CmsResource;
034import org.opencms.main.CmsException;
035import org.opencms.main.CmsLog;
036import org.opencms.main.OpenCms;
037import org.opencms.search.CmsIndexException;
038import org.opencms.search.I_CmsSearchDocument;
039import org.opencms.search.I_CmsSearchIndex;
040import org.opencms.search.extractors.CmsExtractionResult;
041import org.opencms.search.extractors.I_CmsExtractionResult;
042import org.opencms.util.CmsStringUtil;
043import org.opencms.xml.A_CmsXmlDocument;
044import org.opencms.xml.containerpage.CmsContainerElementBean;
045import org.opencms.xml.containerpage.CmsContainerPageBean;
046import org.opencms.xml.containerpage.CmsFormatterConfiguration;
047import org.opencms.xml.containerpage.CmsXmlContainerPage;
048import org.opencms.xml.containerpage.CmsXmlContainerPageFactory;
049import org.opencms.xml.content.CmsXmlContentFactory;
050import org.opencms.xml.types.I_CmsXmlContentValue;
051
052import java.util.LinkedHashMap;
053import java.util.List;
054import java.util.Locale;
055
056import org.apache.commons.logging.Log;
057
058/**
059 * Lucene document factory class to extract index data from a resource
060 * of type <code>CmsResourceTypeContainerPage</code>.<p>
061 *
062 * @since 8.0
063 */
064public class CmsDocumentContainerPage extends A_CmsVfsDocument {
065
066    /** The log object for this class. */
067    private static final Log LOG = CmsLog.getLog(CmsDocumentContainerPage.class);
068
069    /**
070     * Creates a new instance of this lucene document factory.<p>
071     *
072     * @param name name of the document type
073     */
074    public CmsDocumentContainerPage(String name) {
075
076        super(name);
077    }
078
079    /**
080     * Generates a new lucene document instance from contents of the given resource for the provided index.<p>
081     *
082     * For container pages, we must not cache based on the container page content age,
083     * since the content of the included elements may change any time.
084     */
085    @Override
086    public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
087    throws CmsException {
088
089        // extract the content from the resource
090        I_CmsExtractionResult content = null;
091
092        if (index.isExtractingContent()) {
093            // do full text content extraction only if required
094
095            try {
096                content = extractContent(cms, resource, index);
097            } catch (Exception e) {
098                // text extraction failed for document - continue indexing meta information only
099                LOG.error(Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()), e);
100            }
101        }
102
103        // create the Lucene document according to the index field configuration
104        return index.getFieldConfiguration().createDocument(cms, resource, index, content);
105    }
106
107    /**
108     * Returns the raw text content of a VFS resource of type <code>CmsResourceTypeContainerPage</code>.<p>
109     *
110     * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, I_CmsSearchIndex)
111     */
112    public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
113    throws CmsException {
114
115        logContentExtraction(resource, index);
116        try {
117            CmsFile file = readFile(cms, resource);
118            CmsXmlContainerPage containerPage = CmsXmlContainerPageFactory.unmarshal(cms, file);
119            Locale locale = index.getLocaleForResource(cms, resource, null);
120
121            // initialize return values
122            StringBuffer content = new StringBuffer();
123            LinkedHashMap<String, String> items = new LinkedHashMap<String, String>();
124
125            CmsContainerPageBean containerBean = containerPage.getContainerPage(cms);
126            for (CmsContainerElementBean element : containerBean.getElements()) {
127                // check all elements in this container
128
129                // get the formatter configuration for this element
130                element.initResource(cms);
131                CmsADEConfigData adeConfig = OpenCms.getADEManager().lookupConfiguration(cms, file.getRootPath());
132                CmsFormatterConfiguration formatters = adeConfig.getFormatters(cms, element.getResource());
133
134                if (formatters.isSearchContent(element.getFormatterId())) {
135                    // the content of this element must be included for the container page
136
137                    element.initResource(cms);
138                    CmsFile elementFile = readFile(cms, element.getResource());
139                    A_CmsXmlDocument elementContent = CmsXmlContentFactory.unmarshal(cms, elementFile);
140                    List<String> elementNames = elementContent.getNames(locale);
141                    for (String xpath : elementNames) {
142                        // xpath will have the form "Text[1]" or "Nested[1]/Text[1]"
143                        I_CmsXmlContentValue value = elementContent.getValue(xpath, locale);
144                        if (value.getContentDefinition().getContentHandler().isSearchable(value)) {
145                            // the content value is searchable
146                            String extracted = value.getPlainText(cms);
147                            if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(extracted)) {
148                                items.put(elementFile.getRootPath() + "/" + xpath, extracted);
149                                content.append(extracted);
150                                content.append('\n');
151                            }
152                        }
153                    }
154                }
155            }
156
157            return new CmsExtractionResult(content.toString(), items);
158
159        } catch (Exception e) {
160            throw new CmsIndexException(
161                Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
162                e);
163        }
164    }
165
166    /**
167     * @see org.opencms.search.documents.I_CmsDocumentFactory#isLocaleDependend()
168     */
169    public boolean isLocaleDependend() {
170
171        return true;
172    }
173
174    /**
175     * @see org.opencms.search.documents.I_CmsDocumentFactory#isUsingCache()
176     */
177    public boolean isUsingCache() {
178
179        return true;
180    }
181}