001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.documents;
029
030import org.opencms.file.CmsFile;
031import org.opencms.file.CmsObject;
032import org.opencms.file.CmsResource;
033import org.opencms.main.CmsException;
034import org.opencms.search.CmsIndexException;
035import org.opencms.search.I_CmsSearchDocument;
036import org.opencms.search.I_CmsSearchIndex;
037import org.opencms.search.extractors.CmsExtractionResult;
038import org.opencms.search.extractors.I_CmsExtractionResult;
039import org.opencms.util.CmsStringUtil;
040import org.opencms.xml.A_CmsXmlDocument;
041import org.opencms.xml.CmsXmlContentDefinition;
042import org.opencms.xml.content.CmsXmlContentFactory;
043import org.opencms.xml.content.I_CmsXmlContentHandler;
044import org.opencms.xml.types.I_CmsXmlContentValue;
045
046import java.util.Iterator;
047import java.util.LinkedHashMap;
048import java.util.List;
049import java.util.Locale;
050
051/**
052 * Lucene document factory class to extract index data from an OpenCms VFS resource
053 * of type <code>CmsResourceTypeXmlContent</code>.<p>
054 *
055 * All XML nodes from the content for all locales will be stored separately in the item map
056 * which you can access using {@link CmsExtractionResult#getContentItems()}. The XML elements will be
057 * accessible using their xpath. The xpath will have the form like for example
058 * <code>Text[1]</code> or <code>Nested[1]/Text[1]</code>.<p>
059 *
060 * @since 6.0.0
061 */
062public class CmsDocumentXmlContent extends A_CmsVfsDocument {
063
064    /**
065     * Creates a new instance of this lucene document factory.<p>
066     *
067     * @param name name of the document type
068     */
069    public CmsDocumentXmlContent(String name) {
070
071        super(name);
072    }
073
074    /**
075     *
076     * @see org.opencms.search.documents.A_CmsVfsDocument#createDocument(org.opencms.file.CmsObject, org.opencms.file.CmsResource, org.opencms.search.I_CmsSearchIndex)
077     */
078    @Override
079    public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
080    throws CmsException {
081
082        CmsXmlContentDefinition def = CmsXmlContentDefinition.getContentDefinitionForResource(cms, resource);
083        if (def.getContentHandler().isContainerPageOnly()) {
084            return null;
085        }
086        return super.createDocument(cms, resource, index);
087    }
088
089    /**
090     * Returns the raw text content of a given VFS resource of type <code>CmsResourceTypeXmlContent</code>.<p>
091     *
092     * All XML nodes from the content for all locales will be stored separately in the item map
093     * which you can access using {@link CmsExtractionResult#getContentItems()}. The XML elements will be
094     * accessible using their xpath. The xpath will have the form like for example
095     * <code>Text[1]</code> or <code>Nested[1]/Text[1]</code>.<p>
096     *
097     * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, I_CmsSearchIndex)
098     */
099    public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, I_CmsSearchIndex index)
100    throws CmsException {
101
102        logContentExtraction(resource, index);
103        try {
104            CmsFile file = readFile(cms, resource);
105            A_CmsXmlDocument xmlContent = CmsXmlContentFactory.unmarshal(cms, file);
106            I_CmsXmlContentHandler handler = xmlContent.getHandler();
107            Locale locale = index.getLocaleForResource(cms, resource, xmlContent.getLocales());
108            List<String> elements = xmlContent.getNames(locale);
109            StringBuffer content = new StringBuffer();
110            LinkedHashMap<String, String> items = new LinkedHashMap<String, String>();
111            for (Iterator<String> i = elements.iterator(); i.hasNext();) {
112                String xpath = i.next();
113                // xpath will have the form "Text[1]" or "Nested[1]/Text[1]"
114                I_CmsXmlContentValue value = xmlContent.getValue(xpath, locale);
115                if (handler.isSearchable(value)) {
116                    // the content value is searchable
117                    String extracted = value.getPlainText(cms);
118                    if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(extracted)) {
119                        items.put(xpath, extracted);
120                        content.append(extracted);
121                        content.append('\n');
122                    }
123                }
124            }
125            return new CmsExtractionResult(content.toString(), items);
126        } catch (Exception e) {
127            throw new CmsIndexException(
128                Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
129                e);
130        }
131    }
132
133    /**
134     * @see org.opencms.search.documents.I_CmsDocumentFactory#isLocaleDependend()
135     */
136    public boolean isLocaleDependend() {
137
138        return true;
139    }
140
141    /**
142     * @see org.opencms.search.documents.I_CmsDocumentFactory#isUsingCache()
143     */
144    public boolean isUsingCache() {
145
146        return true;
147    }
148}