/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Field = Lucene.Net.Documents.Field;
using IndexReader = Lucene.Net.Index.IndexReader;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using Term = Lucene.Net.Index.Term;

namespace Lucene.Net.Search
{
	/// <summary>Expert: Scoring API.
	///
	/// Subclasses implement search scoring.
	///
	/// The score of query q for document d is defined
	/// in terms of these methods as follows:
	///
	///     score(q,d) =
	///         Σ over each term t in q of:
	///             ( {@link #Tf(int) tf}(t in d) *
	///               {@link #Idf(Term,Searcher) idf}(t)^2 *
	///               {@link Query#getBoost getBoost}(t in q) *
	///               {@link Field#getBoost getBoost}(t.field in d) *
	///               {@link #LengthNorm(String,int) lengthNorm}(t.field in d) )
	///         * {@link #Coord(int,int) coord}(q,d)
	///         * {@link #QueryNorm(float) queryNorm}(sumOfSquaredWeights)
	///
	/// where
	///
	///     sumOfSquaredWeights =
	///         Σ over each term t in q of:
	///             ( {@link #Idf(Term,Searcher) idf}(t) *
	///               {@link Query#getBoost getBoost}(t in q) )^2
	///
	/// Note that the above formula is motivated by the cosine distance, or dot product,
	/// between document and query vectors, which is implemented by {@link DefaultSimilarity}.
	/// </summary>
	[Serializable]
	public abstract class Similarity
	{
		/// <summary>The Similarity implementation used by default.</summary>
		private static Similarity defaultImpl = new DefaultSimilarity();
		
		/// <summary>Set the default Similarity implementation used by indexing and search
		/// code.
		/// </summary>
		public static void SetDefault(Similarity similarity)
		{
			Similarity.defaultImpl = similarity;
		}
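		// Usage sketch (illustrative, not part of this class): installing a
		// custom Similarity before indexing and searching. MySimilarity is a
		// hypothetical subclass standing in for your own implementation.
		//
		//     Similarity.SetDefault(new MySimilarity());
		//     // IndexWriter and Searcher instances created afterwards pick up
		//     // the new default unless one is set on them explicitly.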

		/// <summary>Return the default Similarity implementation used by indexing and search
		/// code.
		///
		/// This is initially an instance of {@link DefaultSimilarity}.
		/// </summary>
		public static Similarity GetDefault()
		{
			return Similarity.defaultImpl;
		}
		
		/// <summary>Cache of decoded bytes.</summary>
		private static readonly float[] NORM_TABLE = new float[256];
		
		/// <summary>Decodes a normalization factor stored in an index.</summary>
		public static float DecodeNorm(byte b)
		{
			return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
		}
		
		/// <summary>Returns a table for decoding normalization bytes.</summary>
		public static float[] GetNormDecoder()
		{
			return NORM_TABLE;
		}
		
		/// <summary>Computes the normalization value for a field given the total number of
		/// terms contained in a field. These values, together with field boosts, are
		/// stored in an index and multiplied into scores for hits on each field by the
		/// search code.
		///
		/// Matches in longer fields are less precise, so implementations of this
		/// method usually return smaller values when numTokens is large,
		/// and larger values when numTokens is small.
		///
		/// Note that these values are computed under {@link
		/// IndexWriter#AddDocument(Lucene.Net.Documents.Document)} and then stored using
		/// {@link #EncodeNorm(float)}. Thus they have limited precision, and documents
		/// must be re-indexed if this method is altered.
		/// </summary>
		/// <param name="fieldName">the name of the field</param>
		/// <param name="numTokens">the total number of tokens contained in fields named
		/// fieldName of doc</param>
		/// <returns>a normalization factor for hits on this field of this document</returns>
		public abstract float LengthNorm(System.String fieldName, int numTokens);
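		// An illustrative override (a sketch, not part of this class), in the
		// spirit of {@link DefaultSimilarity}: dampen long fields with an inverse
		// square root, so a field twice as long scores about 1/sqrt(2) as high
		// per matching term.
		//
		//     public override float LengthNorm(System.String fieldName, int numTokens)
		//     {
		//         return (float) (1.0 / System.Math.Sqrt(numTokens));
		//     }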

		/// <summary>Computes the normalization value for a query given the sum of the squared
		/// weights of each of the query terms. This value is then multiplied into the
		/// weight of each query term.
		///
		/// This does not affect ranking, but rather just attempts to make scores
		/// from different queries comparable.
		/// </summary>
		/// <param name="sumOfSquaredWeights">the sum of the squares of query term weights</param>
		/// <returns>a normalization factor for query weights</returns>
		public abstract float QueryNorm(float sumOfSquaredWeights);
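		// An illustrative override (a sketch, not part of this class), in the
		// spirit of {@link DefaultSimilarity}: dividing by the square root of the
		// sum of squared weights normalizes the query vector to roughly unit
		// length, making scores from different queries more comparable.
		//
		//     public override float QueryNorm(float sumOfSquaredWeights)
		//     {
		//         return (float) (1.0 / System.Math.Sqrt(sumOfSquaredWeights));
		//     }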

		/// <summary>Encodes a normalization factor for storage in an index.
		///
		/// The encoding uses a five-bit exponent and three-bit mantissa, thus
		/// representing values from around 7x10^9 to 2x10^-9 with about one
		/// significant decimal digit of accuracy. Zero is also represented.
		/// Negative numbers are rounded up to zero. Values too large to represent
		/// are rounded down to the largest representable value. Positive values too
		/// small to represent are rounded up to the smallest positive representable
		/// value.
		/// </summary>
		public static byte EncodeNorm(float f)
		{
			return FloatToByte(f);
		}
		
		private static float ByteToFloat(byte b)
		{
			if (b == 0) // zero is a special case
				return 0.0f;
			int mantissa = b & 7;
			int exponent = (b >> 3) & 31;
			int bits = ((exponent + (63 - 15)) << 24) | (mantissa << 21);
			return BitConverter.ToSingle(BitConverter.GetBytes(bits), 0);
		}
		
		private static byte FloatToByte(float f)
		{
			if (f < 0.0f) // round negatives up to zero
				f = 0.0f;
			if (f == 0.0f) // zero is a special case
				return 0;
			int bits = BitConverter.ToInt32(BitConverter.GetBytes(f), 0); // parse float into parts
			int mantissa = (bits & 0xffffff) >> 21;
			int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
			if (exponent > 31)
			{
				// overflow: use max value
				exponent = 31;
				mantissa = 7;
			}
			if (exponent < 0)
			{
				// underflow: use min value
				exponent = 0;
				mantissa = 1;
			}
			return (byte) ((exponent << 3) | mantissa); // pack into a byte
		}
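		// Worked example of the lossy round trip above (values follow from
		// FloatToByte/ByteToFloat as written): 1.0f survives exactly (mantissa 4,
		// exponent 15 packs to byte 124), while 0.1f packs to byte 110 and decodes
		// to 0.09375f, i.e. about one significant decimal digit of accuracy.
		//
		//     byte b = Similarity.EncodeNorm(0.1f);   // 110
		//     float f = Similarity.DecodeNorm(b);     // 0.09375f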

		/// <summary>Computes a score factor based on a term or phrase's frequency in a
		/// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
		/// factor for each term in the query and these products are then summed to
		/// form the initial score for a document.
		///
		/// Terms and phrases repeated in a document indicate the topic of the
		/// document, so implementations of this method usually return larger values
		/// when freq is large, and smaller values when freq
		/// is small.
		///
		/// The default implementation calls {@link #Tf(float)}.
		/// </summary>
		/// <param name="freq">the frequency of a term within a document</param>
		/// <returns>a score factor based on a term's within-document frequency</returns>
		public virtual float Tf(int freq)
		{
			return Tf((float) freq);
		}

		/// <summary>Computes the amount of a sloppy phrase match, based on an edit distance.
		/// This value is summed for each sloppy phrase match in a document to form
		/// the frequency that is passed to {@link #Tf(float)}.
		///
		/// A phrase match with a small edit distance to a document passage more
		/// closely matches the document, so implementations of this method usually
		/// return larger values when the edit distance is small and smaller values
		/// when it is large.
		/// </summary>
		/// <param name="distance">the edit distance of this sloppy phrase match</param>
		/// <returns>the frequency increment for this match</returns>
		public abstract float SloppyFreq(int distance);
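		// An illustrative override (a sketch, not part of this class), in the
		// spirit of {@link DefaultSimilarity}: an exact phrase match (distance 0)
		// contributes 1.0, and the contribution decays as the edit distance grows.
		//
		//     public override float SloppyFreq(int distance)
		//     {
		//         return 1.0f / (distance + 1);
		//     }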

		/// <summary>Computes a score factor based on a term or phrase's frequency in a
		/// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
		/// factor for each term in the query and these products are then summed to
		/// form the initial score for a document.
		///
		/// Terms and phrases repeated in a document indicate the topic of the
		/// document, so implementations of this method usually return larger values
		/// when freq is large, and smaller values when freq
		/// is small.
		/// </summary>
		/// <param name="freq">the frequency of a term within a document</param>
		/// <returns>a score factor based on a term's within-document frequency</returns>
		public abstract float Tf(float freq);
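		// An illustrative override (a sketch, not part of this class), in the
		// spirit of {@link DefaultSimilarity}: the square root grows with freq but
		// with diminishing returns, so ten occurrences of a term score well above
		// one occurrence, but not ten times as high.
		//
		//     public override float Tf(float freq)
		//     {
		//         return (float) System.Math.Sqrt(freq);
		//     }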

		/// <summary>Computes a score factor for a simple term.
		///
		/// The default implementation is:
		///
		///     return Idf(searcher.DocFreq(term), searcher.MaxDoc());
		///
		/// Note that {@link Searcher#MaxDoc()} is used instead of
		/// {@link IndexReader#NumDocs()} because it is proportional to
		/// {@link Searcher#DocFreq(Term)}, i.e., when one is inaccurate,
		/// so is the other, and in the same direction.
		/// </summary>
		/// <param name="term">the term in question</param>
		/// <param name="searcher">the document collection being searched</param>
		/// <returns>a score factor for the term</returns>
		public virtual float Idf(Term term, Searcher searcher)
		{
			return Idf(searcher.DocFreq(term), searcher.MaxDoc());
		}

		/// <summary>Computes a score factor for a phrase.
		///
		/// The default implementation sums the {@link #Idf(Term,Searcher)} factor
		/// for each term in the phrase.
		/// </summary>
		/// <param name="terms">the terms in the phrase</param>
		/// <param name="searcher">the document collection being searched</param>
		/// <returns>a score factor for the phrase</returns>
		public virtual float Idf(System.Collections.ICollection terms, Searcher searcher)
		{
			float idf = 0.0f;
			System.Collections.IEnumerator i = terms.GetEnumerator();
			while (i.MoveNext())
			{
				idf += Idf((Term) i.Current, searcher);
			}
			return idf;
		}

		/// <summary>Computes a score factor based on a term's document frequency (the number
		/// of documents which contain the term). This value is multiplied by the
		/// {@link #Tf(int)} factor for each term in the query and these products are
		/// then summed to form the initial score for a document.
		///
		/// Terms that occur in fewer documents are better indicators of topic, so
		/// implementations of this method usually return larger values for rare terms,
		/// and smaller values for common terms.
		/// </summary>
		/// <param name="docFreq">the number of documents which contain the term</param>
		/// <param name="numDocs">the total number of documents in the collection</param>
		/// <returns>a score factor based on the term's document frequency</returns>
		public abstract float Idf(int docFreq, int numDocs);
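		// An illustrative override (a sketch, not part of this class), in the
		// spirit of {@link DefaultSimilarity}: rare terms get a factor that grows
		// with the log of the inverse document frequency; the docFreq + 1 in the
		// denominator guards against division by zero.
		//
		//     public override float Idf(int docFreq, int numDocs)
		//     {
		//         return (float) (System.Math.Log(numDocs / (double) (docFreq + 1)) + 1.0);
		//     }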

		/// <summary>Computes a score factor based on the fraction of all query terms that a
		/// document contains. This value is multiplied into scores.
		///
		/// The presence of a large portion of the query terms indicates a better
		/// match with the query, so implementations of this method usually return
		/// larger values when the ratio between these parameters is large and smaller
		/// values when the ratio between them is small.
		/// </summary>
		/// <param name="overlap">the number of query terms matched in the document</param>
		/// <param name="maxOverlap">the total number of terms in the query</param>
		/// <returns>a score factor based on term overlap with the query</returns>
		public abstract float Coord(int overlap, int maxOverlap);
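		// An illustrative override (a sketch, not part of this class), in the
		// spirit of {@link DefaultSimilarity}: the raw fraction of query terms
		// matched, so a document matching 3 of 4 terms is scaled by 0.75.
		//
		//     public override float Coord(int overlap, int maxOverlap)
		//     {
		//         return overlap / (float) maxOverlap;
		//     }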

		static Similarity()
		{
			for (int i = 0; i < 256; i++)
				NORM_TABLE[i] = ByteToFloat((byte) i);
		}
	}
}