001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search;
029
030import org.opencms.search.fields.CmsSearchField;
031
032import org.apache.lucene.index.FieldInvertState;
033import org.apache.lucene.search.CollectionStatistics;
034import org.apache.lucene.search.TermStatistics;
035import org.apache.lucene.search.similarities.BM25Similarity;
036import org.apache.lucene.search.similarities.Similarity;
037import org.apache.lucene.util.SmallFloat;
038
039/**
040 * Reduces the importance of the <code>{@link #computeNorm(FieldInvertState)}</code> factor
041 * for the <code>{@link org.opencms.search.fields.CmsLuceneField#FIELD_CONTENT}</code> field, while
042 * keeping the Lucene default for all other fields.<p>
043 *
044 * This implementation was added since apparently the default length norm is heavily biased
045 * for small documents. In the default, even if a term is found in 2 documents the same number of
046 * times, the smaller document (containing less terms) will have a score easily 3x as high as
047 * the longer document. Using this implementation the importance of the term number is reduced.<p>
048 *
049 * Inspired by Chuck Williams WikipediaSimilarity.<p>
050 *
051 * @since 6.0.0
052 */
053public class CmsSearchSimilarity extends Similarity {
054
055    /** Logarithm base 10 used as factor in the score calculations. */
056    private static final double LOG10 = Math.log(10.0);
057
058    /** Similarity implementation the CmsSearchSimilarity is based on. */
059    private final BM25Similarity m_bm25Sim = new BM25Similarity();
060
061    /**
062     * Creates a new instance of the OpenCms search similarity.<p>
063     */
064    public CmsSearchSimilarity() {
065
066    }
067
068    /**
069     * Special implementation for "compute norm" to reduce the significance of this factor
070     * for the <code>{@link org.opencms.search.fields.CmsLuceneField#FIELD_CONTENT}</code> field, while
071     * keeping the Lucene default for all other fields.<p>
072     */
073    @Override
074    public final long computeNorm(FieldInvertState state) {
075
076        final int numTerms = m_bm25Sim.getDiscountOverlaps()
077        ? state.getLength() - state.getNumOverlap()
078        : state.getLength();
079
080        if (state.getIndexCreatedVersionMajor() >= 7) {
081            return SmallFloat.intToByte4(numTerms);
082        } else {
083            return SmallFloat.floatToByte315(lengthNorm(state, numTerms));
084        }
085    }
086
087    /**
088     * Returns true iff overlap tokens are discounted from the document's length.
089     *
090     * @return true iff overlap tokens are discounted from the document's length.
091     *
092     * @see #setDiscountOverlaps(boolean)
093     */
094    public boolean getDiscountOverlaps() {
095
096        return m_bm25Sim.getDiscountOverlaps();
097    }
098
099    /**
100     * @see org.apache.lucene.search.similarities.Similarity#scorer(float, org.apache.lucene.search.CollectionStatistics, org.apache.lucene.search.TermStatistics[])
101     */
102    @Override
103    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
104
105        return m_bm25Sim.scorer(boost, collectionStats, termStats);
106    }
107
108    /**
109     * Sets whether overlap tokens (Tokens with 0 position increment) are
110     * ignored when computing norm.  By default this is true, meaning overlap
111     * tokens do not count when computing norms.
112     *
113     * @param v if true, tokens with position increment 0 are ignored when computing the norm, otherwise they are not ignored.
114     */
115    public void setDiscountOverlaps(boolean v) {
116
117        m_bm25Sim.setDiscountOverlaps(v);
118    }
119
120    /**
121     * Special implementation for "compute norm" to reduce the significance of this factor
122     * for the <code>{@link org.opencms.search.fields.CmsLuceneField#FIELD_CONTENT}</code> field, while
123     * keeping the Lucene default for all other fields.<p>
124     *
125     * @param state  field invert state
126     * @param numTerms number of terms
127     *
128     * @return the norm as specifically created for OpenCms.
129     *
130     */
131    private float lengthNorm(FieldInvertState state, int numTerms) {
132
133        if (state.getName().equals(CmsSearchField.FIELD_CONTENT)) {
134            numTerms = state.getLength() - state.getNumOverlap();
135            // special length norm for content
136            return (float)(3.0 / (Math.log(1000 + numTerms) / LOG10));
137        }
138        // all other fields use the default Lucene implementation
139        return (float)(1 / Math.sqrt(numTerms));
140    }
141}