001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (https://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: https://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: https://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search;
029
030import org.opencms.search.fields.CmsSearchField;
031
032import org.apache.lucene.index.FieldInvertState;
033import org.apache.lucene.search.CollectionStatistics;
034import org.apache.lucene.search.TermStatistics;
035import org.apache.lucene.search.similarities.BM25Similarity;
036import org.apache.lucene.search.similarities.Similarity;
037import org.apache.lucene.util.SmallFloat;
038
039/**
040 * Reduces the importance of the <code>{@link #computeNorm(FieldInvertState)}</code> factor
041 * for the <code>{@link org.opencms.search.fields.CmsLuceneField#FIELD_CONTENT}</code> field, while
042 * keeping the Lucene default for all other fields.<p>
043 *
044 * This implementation was added since apparently the default length norm is heavily biased
045 * for small documents. In the default, even if a term is found in 2 documents the same number of
046 * times, the smaller document (containing less terms) will have a score easily 3x as high as
047 * the longer document. Using this implementation the importance of the term number is reduced.<p>
048 *
049 * Inspired by Chuck Williams WikipediaSimilarity.<p>
050 *
051 * @since 6.0.0
052 */
053public class CmsSearchSimilarity extends Similarity {
054
055    /** Logarithm base 10 used as factor in the score calculations. */
056    private static final double LOG10 = Math.log(10.0);
057
058    /** Similarity implementation the CmsSearchSimilarity is based on. */
059    private static final BM25Similarity m_bm25Sim = new BM25Similarity();
060
061    /**
062     * Creates a new instance of the OpenCms search similarity.<p>
063     */
064    public CmsSearchSimilarity() {
065
066        super(m_bm25Sim.getDiscountOverlaps());
067
068    }
069
070    /**
071     * Special implementation for "compute norm" to reduce the significance of this factor
072     * for the <code>{@link org.opencms.search.fields.CmsLuceneField#FIELD_CONTENT}</code> field, while
073     * keeping the Lucene default for all other fields.<p>
074     */
075    @Override
076    public final long computeNorm(FieldInvertState state) {
077
078        final int numTerms = m_bm25Sim.getDiscountOverlaps()
079        ? state.getLength() - state.getNumOverlap()
080        : state.getLength();
081
082        if (state.getIndexCreatedVersionMajor() >= 7) {
083            return SmallFloat.intToByte4(numTerms);
084        } else {
085            return SmallFloat.floatToByte315(lengthNorm(state, numTerms));
086        }
087    }
088
089    /**
090     * @see org.apache.lucene.search.similarities.Similarity#scorer(float, org.apache.lucene.search.CollectionStatistics, org.apache.lucene.search.TermStatistics[])
091     */
092    @Override
093    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
094
095        return m_bm25Sim.scorer(boost, collectionStats, termStats);
096    }
097
098    /**
099     * Special implementation for "compute norm" to reduce the significance of this factor
100     * for the <code>{@link org.opencms.search.fields.CmsLuceneField#FIELD_CONTENT}</code> field, while
101     * keeping the Lucene default for all other fields.<p>
102     *
103     * @param state  field invert state
104     * @param numTerms number of terms
105     *
106     * @return the norm as specifically created for OpenCms.
107     *
108     */
109    private float lengthNorm(FieldInvertState state, int numTerms) {
110
111        if (state.getName().equals(CmsSearchField.FIELD_CONTENT)) {
112            numTerms = state.getLength() - state.getNumOverlap();
113            // special length norm for content
114            return (float)(3.0 / (Math.log(1000 + numTerms) / LOG10));
115        }
116        // all other fields use the default Lucene implementation
117        return (float)(1 / Math.sqrt(numTerms));
118    }
119}