001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.documents;
029
030import org.opencms.i18n.CmsEncoder;
031import org.opencms.main.OpenCms;
032import org.opencms.search.CmsSearchIndex;
033import org.opencms.search.CmsSearchParameters;
034import org.opencms.search.fields.CmsLuceneFieldConfiguration;
035
036import java.io.IOException;
037import java.io.StringReader;
038import java.util.Iterator;
039
040import org.apache.lucene.analysis.Analyzer;
041import org.apache.lucene.analysis.TokenStream;
042import org.apache.lucene.document.Document;
043import org.apache.lucene.search.Query;
044import org.apache.lucene.search.highlight.Highlighter;
045import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
046import org.apache.lucene.search.highlight.QueryTermScorer;
047
048/**
049 * Default highlighter implementation used for generation of search excerpts.<p>
050 *
051 * @since 6.0.0
052 */
053public class CmsTermHighlighterHtml implements I_CmsTermHighlighter {
054
055    /** Separator for the search excerpt fragments. */
056    private static final String EXCERPT_FRAGMENT_SEPARATOR = " ... ";
057
058    /** Fragments required in excerpt. */
059    private static final int EXCERPT_REQUIRED_FRAGMENTS = 5;
060
061    /**
062     * @see org.opencms.search.documents.I_CmsTermHighlighter#getExcerpt(org.apache.lucene.document.Document, org.opencms.search.CmsSearchIndex, org.opencms.search.CmsSearchParameters, org.apache.lucene.search.Query, org.apache.lucene.analysis.Analyzer)
063     */
064    public String getExcerpt(
065        Document doc,
066        CmsSearchIndex index,
067        CmsSearchParameters params,
068        Query query,
069        Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {
070
071        if ((doc == null) || (index == null) || (params == null) || (analyzer == null) || (query == null)) {
072            return null;
073        }
074        if (!(index.getFieldConfiguration() instanceof CmsLuceneFieldConfiguration)) {
075            // also return null if the field configuration is not a lucene field configuration
076            return null;
077        }
078        Highlighter highlighter = null;
079        CmsLuceneFieldConfiguration conf = (CmsLuceneFieldConfiguration)index.getFieldConfiguration();
080        Iterator<String> excerptFieldNames = conf.getExcerptFieldNames().iterator();
081        StringBuffer excerptBuffer = new StringBuffer();
082        while (excerptFieldNames.hasNext()) {
083            String fieldName = excerptFieldNames.next();
084            boolean createExcerpt = !params.isExcerptOnlySearchedFields() || params.getFields().contains(fieldName);
085            if (createExcerpt && (doc.getField(fieldName) != null)) {
086                // only generate field excerpt if the field is available in the document
087                String text = doc.getField(fieldName).stringValue();
088                // make sure all XML in the text is escaped, otherwise excerpt HTML output may be garbled
089                text = CmsEncoder.escapeXml(text);
090
091                TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
092
093                if (params.isExcerptOnlySearchedFields()) {
094                    // highlight the search query only in the matching fields
095                    highlighter = new Highlighter(new QueryTermScorer(query, fieldName));
096                } else {
097                    // highlight search query in all fields
098                    if (highlighter == null) {
099                        highlighter = new Highlighter(new QueryTermScorer(query));
100                    }
101                }
102                String fragment = highlighter.getBestFragments(
103                    stream,
104                    text,
105                    EXCERPT_REQUIRED_FRAGMENTS,
106                    EXCERPT_FRAGMENT_SEPARATOR);
107
108                // kill all unwanted chars in the excerpt
109                fragment = fragment.replace('\t', ' ');
110                fragment = fragment.replace('\n', ' ');
111                fragment = fragment.replace('\r', ' ');
112                fragment = fragment.replace('\f', ' ');
113
114                if (excerptBuffer.length() > 0) {
115                    // this is not the first fragment
116                    excerptBuffer.append(EXCERPT_FRAGMENT_SEPARATOR);
117                }
118                excerptBuffer.append(fragment);
119            }
120        }
121
122        String result = null;
123        if (excerptBuffer.length() > 0) {
124            result = excerptBuffer.toString();
125        }
126
127        int maxLength = OpenCms.getSearchManager().getMaxExcerptLength();
128        if ((result != null) && (result.length() > maxLength)) {
129            result = result.substring(0, maxLength);
130        }
131
132        return result;
133    }
134}