/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search.documents;

import org.opencms.i18n.CmsEncoder;
import org.opencms.main.OpenCms;
import org.opencms.search.CmsSearchIndex;
import org.opencms.search.CmsSearchParameters;
import org.opencms.search.fields.CmsLuceneFieldConfiguration;

import java.io.IOException;
import java.io.StringReader;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryTermScorer;

/**
 * Default highlighter implementation used for generation of search excerpts.<p>
 *
 * @since 6.0.0
 */
public class CmsTermHighlighterHtml implements I_CmsTermHighlighter {

    /** Separator for the search excerpt fragments. */
    private static final String EXCERPT_FRAGMENT_SEPARATOR = " ... ";

    /** Fragments required in excerpt. */
    private static final int EXCERPT_REQUIRED_FRAGMENTS = 5;

    /**
     * Builds a search excerpt for the given document by highlighting the query terms
     * in every configured excerpt field that is present in the document.<p>
     *
     * The text of each field is XML-escaped before highlighting, the fragments of all
     * fields are joined with {@value #EXCERPT_FRAGMENT_SEPARATOR}, and the result is cut
     * to the maximum excerpt length configured in the search manager.<p>
     *
     * Fields without a String value and fields that produce no highlighted fragment
     * are skipped, so they contribute neither text nor a separator.<p>
     *
     * @param doc the Lucene document to generate the excerpt for
     * @param index the search index providing the field configuration
     * @param params the search parameters (selected fields, "excerpt only searched fields" flag)
     * @param query the query whose terms should be highlighted
     * @param analyzer the analyzer used to tokenize the field text
     *
     * @return the excerpt, or <code>null</code> if any argument is <code>null</code>, the index
     *      does not use a {@link CmsLuceneFieldConfiguration}, or no excerpt text was generated
     *
     * @throws IOException if the highlighter fails while reading the token stream
     * @throws InvalidTokenOffsetsException if the highlighter reports invalid token offsets
     *
     * @see org.opencms.search.documents.I_CmsTermHighlighter#getExcerpt(org.apache.lucene.document.Document, org.opencms.search.CmsSearchIndex, org.opencms.search.CmsSearchParameters, org.apache.lucene.search.Query, org.apache.lucene.analysis.Analyzer)
     */
    public String getExcerpt(
        Document doc,
        CmsSearchIndex index,
        CmsSearchParameters params,
        Query query,
        Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {

        if ((doc == null) || (index == null) || (params == null) || (analyzer == null) || (query == null)) {
            return null;
        }
        if (!(index.getFieldConfiguration() instanceof CmsLuceneFieldConfiguration)) {
            // also return null if the field configuration is not a lucene field configuration
            return null;
        }
        Highlighter highlighter = null;
        CmsLuceneFieldConfiguration conf = (CmsLuceneFieldConfiguration)index.getFieldConfiguration();
        Iterator<String> excerptFieldNames = conf.getExcerptFieldNames().iterator();
        // the buffer never leaves this method, so no synchronization is required
        StringBuilder excerptBuffer = new StringBuilder();
        while (excerptFieldNames.hasNext()) {
            String fieldName = excerptFieldNames.next();
            boolean createExcerpt = !params.isExcerptOnlySearchedFields() || params.getFields().contains(fieldName);
            if (!createExcerpt || (doc.getField(fieldName) == null)) {
                // only generate field excerpt if the field is wanted and available in the document
                continue;
            }
            String text = doc.getField(fieldName).stringValue();
            if (text == null) {
                // the field has no String value, a StringReader can not be created for it
                continue;
            }
            // make sure all XML in the text is escaped, otherwise excerpt HTML output may be garbled
            text = CmsEncoder.escapeXml(text);

            TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));

            if (params.isExcerptOnlySearchedFields()) {
                // highlight the search query only in the matching fields
                highlighter = new Highlighter(new QueryTermScorer(query, fieldName));
            } else if (highlighter == null) {
                // highlight search query in all fields, the highlighter is shared between the fields
                highlighter = new Highlighter(new QueryTermScorer(query));
            }
            String fragment = highlighter.getBestFragments(
                stream,
                text,
                EXCERPT_REQUIRED_FRAGMENTS,
                EXCERPT_FRAGMENT_SEPARATOR);

            if ((fragment == null) || (fragment.length() == 0)) {
                // nothing was highlighted in this field, do not append a dangling separator
                continue;
            }

            if (excerptBuffer.length() > 0) {
                // this is not the first fragment
                excerptBuffer.append(EXCERPT_FRAGMENT_SEPARATOR);
            }
            excerptBuffer.append(replaceUnwantedChars(fragment));
        }

        String result = null;
        if (excerptBuffer.length() > 0) {
            result = excerptBuffer.toString();
        }

        // NOTE(review): the excerpt is cut by plain character count, which may split an escaped
        // entity or highlight markup at the end of the excerpt - confirm that callers tolerate this
        int maxLength = OpenCms.getSearchManager().getMaxExcerptLength();
        if ((result != null) && (result.length() > maxLength)) {
            result = result.substring(0, maxLength);
        }

        return result;
    }

    /**
     * Replaces tab, line feed, carriage return and form feed characters with a blank.<p>
     *
     * @param fragment the excerpt fragment to clean up, must not be <code>null</code>
     *
     * @return the fragment with the unwanted characters replaced
     */
    private static String replaceUnwantedChars(String fragment) {

        return fragment.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ').replace('\f', ' ');
    }
}