
Analyzers.h

/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_analysis_Analyzers_
#define _lucene_analysis_Analyzers_

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "CLucene/util/Reader.h"
#include "AnalysisHeader.h"
#include "CLucene/util/Misc.h"

CL_NS_DEF(analysis)

/** An abstract base class for simple, character-oriented tokenizers.*/
class CharTokenizer:public Tokenizer {
private:
      int32_t offset, bufferIndex, dataLen;
      TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
      const TCHAR* ioBuffer;
protected:
    
    /** Returns true iff a character should be included in a token.  This
    * tokenizer generates as tokens adjacent sequences of characters which
    * satisfy this predicate.  Characters for which this is false are used to
    * define token boundaries and are not included in tokens. */
      virtual bool isTokenChar(const TCHAR c) const = 0;

    /** Called on each token character to normalize it before it is added to the
    * token.  The default implementation does nothing.  Subclasses may use this
    * to, e.g., lowercase tokens. */
      virtual TCHAR normalize(const TCHAR c) const;

public:
      CharTokenizer(CL_NS(util)::Reader* in);
      virtual ~CharTokenizer(){
      }
      bool next(Token* token);
};
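
/* Example (an illustrative sketch, not part of this header): a hypothetical
 * DigitTokenizer built on CharTokenizer that keeps only decimal digits;
 * _istdigit is assumed to be available alongside the _istalpha and _istspace
 * predicates used by the tokenizers below.
 *
 *   class DigitTokenizer: public CharTokenizer {
 *   public:
 *       DigitTokenizer(CL_NS(util)::Reader* in): CharTokenizer(in) {}
 *   protected:
 *       bool isTokenChar(const TCHAR c) const { return _istdigit(c) != 0; }
 *   };
 *
 * Tokens are then the maximal runs of digits; every other character acts as a
 * boundary and is not included in any token, exactly as described above.
 */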


/** A LetterTokenizer is a tokenizer that divides text at non-letters.  That is
to say, it defines tokens as maximal strings of adjacent letters, as defined by
the _istalpha predicate (the C++ counterpart of the java.lang.Character.isLetter()
predicate in the original Java Lucene).

Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces. */
class LetterTokenizer:public CharTokenizer {
public:
      /** Construct a new LetterTokenizer. */
      LetterTokenizer(CL_NS(util)::Reader* in):
      CharTokenizer(in) {}

    ~LetterTokenizer(){}
protected:
    /** Collects only characters which satisfy _istalpha.*/
      bool isTokenChar(const TCHAR c) const;
};



/**
* LowerCaseTokenizer performs the function of LetterTokenizer
* and LowerCaseFilter together.  It divides text at non-letters and converts
* the resulting tokens to lower case.  While it is functionally equivalent to
* the combination of LetterTokenizer and LowerCaseFilter, there is a
* performance advantage to doing the two tasks at once, hence this (redundant)
* implementation.
* <P>
* Note: this does a decent job for most European languages, but does a terrible
* job for some Asian languages, where words are not separated by spaces.
*/
class LowerCaseTokenizer:public LetterTokenizer {
public:
      /** Construct a new LowerCaseTokenizer. */
      LowerCaseTokenizer(CL_NS(util)::Reader* in):
      LetterTokenizer(in) {}

    ~LowerCaseTokenizer(){}
protected:
      /** Collects only characters which satisfy _totlower. */
      TCHAR normalize(const TCHAR chr) const;
};


/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
 * Adjacent sequences of non-whitespace characters form tokens. */
class WhitespaceTokenizer: public CharTokenizer {
public:
      /** Construct a new WhitespaceTokenizer. */ 
      WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {}
      ~WhitespaceTokenizer(){}
protected:
      /** Collects only characters which do not satisfy _istspace.
      */
      bool isTokenChar(const TCHAR c) const;
};


/** An Analyzer that uses WhitespaceTokenizer. */
class WhitespaceAnalyzer: public Analyzer {
 public:
  TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
  ~WhitespaceAnalyzer(){}
};

/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
class SimpleAnalyzer: public Analyzer {
public:
      TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
      ~SimpleAnalyzer(){}
};
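
/* Example (an illustrative sketch, assuming the 0.9.x CLucene API in which
 * TokenStream::next fills a caller-supplied Token): pulling tokens out of a
 * WhitespaceAnalyzer.  StringReader comes from CLucene/util/Reader.h, and the
 * caller owns the returned stream.
 *
 *   CL_NS(util)::StringReader reader(_T("Hello  CLucene world"));
 *   WhitespaceAnalyzer analyzer;
 *   TokenStream* ts = analyzer.tokenStream(_T("contents"), &reader);
 *   Token t;
 *   while (ts->next(&t))
 *       _tprintf(_T("%s\n"), t.termText());  // Hello, CLucene, world
 *   ts->close();
 *   _CLDELETE(ts);
 */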



/**
* Normalizes token text to lower case.
*/
class LowerCaseFilter: public TokenFilter {
public:
      LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {}
      ~LowerCaseFilter(){}
      bool next(Token* token);
};
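
/* Example (sketch): lower-casing whitespace-separated tokens, i.e. the
 * filter-based counterpart of what LowerCaseTokenizer does in a single pass
 * for letter-based tokens.  Passing true as deleteTokenStream makes the
 * filter delete the wrapped tokenizer.
 *
 *   CL_NS(util)::StringReader reader(_T("MiXeD CaSe"));
 *   TokenStream* ts = _CLNEW LowerCaseFilter(
 *       _CLNEW WhitespaceTokenizer(&reader), true);
 *   Token t;
 *   while (ts->next(&t))
 *       _tprintf(_T("%s\n"), t.termText());  // mixed, case
 *   ts->close();
 *   _CLDELETE(ts);
 */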


/**
 * Removes stop words from a token stream.
 */
class StopFilter: public TokenFilter {
private:
      //bvk: I found this to work faster with a non-hash table; the number of
      //items in the stop table is not likely to be large enough to make hashing worthwhile.
      CL_NS(util)::CLSetList<const TCHAR*>* table;
public:
      /** Constructs a filter which removes words from the input
      *     TokenStream that are named in the array of words. */
      StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords);

      ~StopFilter(){}

      /** Constructs a filter which removes words from the input
      *     TokenStream that are named in the CLSetList.
      */
      StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList<const TCHAR*>* stopTable):
            TokenFilter(in, deleteTokenStream),
            table(stopTable)
      {} 
        
      
      /**
      * Fills the given stop-word table from an array of stop words, suitable
      * for passing into the StopFilter constructor.  This permits the table
      * construction to be cached once when an Analyzer is constructed.
      * Note: the stopWords array must be static because the strings are not copied.
      */
      static void fillStopTable(CL_NS(util)::CLSetList<const TCHAR*>* stopTable,
                                      const TCHAR** stopWords);

      /**
      * Returns the next input Token whose termText() is not a stop word.
      */ 
      bool next(Token* token);
};
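
/* Example (a sketch; the stop-word array is assumed to be NULL-terminated,
 * as ENGLISH_STOP_WORDS below is): wrapping a LowerCaseTokenizer with a
 * StopFilter built from a static word list.  The strings are not copied, so
 * the array must outlive the filter.
 *
 *   static const TCHAR* MY_STOP_WORDS[] = { _T("the"), _T("a"), NULL };
 *   CL_NS(util)::StringReader reader(_T("The quick fox"));
 *   TokenStream* ts = _CLNEW StopFilter(
 *       _CLNEW LowerCaseTokenizer(&reader), true, MY_STOP_WORDS);
 *   Token t;
 *   while (ts->next(&t))
 *       _tprintf(_T("%s\n"), t.termText());  // quick, fox
 *   ts->close();
 *   _CLDELETE(ts);
 */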




/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
class StopAnalyzer: public Analyzer {
    CL_NS(util)::CLSetList<const TCHAR*> stopTable;

public:
    /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
    StopAnalyzer();
    ~StopAnalyzer();
    
    /** Builds an analyzer which removes words in the provided array. */
    StopAnalyzer( const TCHAR** stopWords );
    /** Filters LowerCaseTokenizer with StopFilter. */
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
      
      /** An array containing some common English words that are not usually useful
    for searching. */
    static const TCHAR* ENGLISH_STOP_WORDS[];
};



/**
 * This analyzer is used to facilitate scenarios where different
 * fields require different analysis techniques.  Use {@link #addAnalyzer}
 * to add a non-default analyzer on a field name basis.
 * 
 * <p>Example usage:
 * 
 * <pre>
 *   PerFieldAnalyzerWrapper aWrapper =
 *      new PerFieldAnalyzerWrapper(new StandardAnalyzer());
 *   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
 *   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
 * </pre>
 * 
 * <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
 * and "lastname", for which KeywordAnalyzer will be used.
 * 
 * <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
 * and query parsing.
 */
class PerFieldAnalyzerWrapper : public Analyzer {
private:
    Analyzer* defaultAnalyzer;
    CL_NS(util)::CLHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar,
    CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap;
public:
    /**
    * Constructs with default analyzer.
    *
    * @param defaultAnalyzer Any fields not specifically
    * defined to use a different analyzer will use the one provided here.
    */
    PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);
    ~PerFieldAnalyzerWrapper();
    
    /**
    * Defines an analyzer to use for the specified field.
    *
    * @param fieldName field name requiring a non-default analyzer
    * @param analyzer non-default analyzer to use for field
    */
    void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer);
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
};
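
/* Example (a C++ rendering of the usage sketched above, substituting
 * analyzers declared in this header for StandardAnalyzer; the field names
 * are hypothetical):
 *
 *   PerFieldAnalyzerWrapper wrapper(_CLNEW SimpleAnalyzer());
 *   wrapper.addAnalyzer(_T("firstname"), _CLNEW KeywordAnalyzer());
 *   wrapper.addAnalyzer(_T("lastname"), _CLNEW KeywordAnalyzer());
 *   // "firstname" and "lastname" are emitted as single keyword tokens;
 *   // every other field goes through SimpleAnalyzer.  The analyzerMap's
 *   // Deletor frees the added analyzers when the wrapper is destroyed.
 */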


/**
 * A filter that replaces accented characters in the ISO Latin 1 character set 
 * (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
 * <p>
 * For instance, 'à' will be replaced by 'a'.
 */
class ISOLatin1AccentFilter: public TokenFilter {
public:
      ISOLatin1AccentFilter(TokenStream* input, bool deleteTs):
            TokenFilter(input,deleteTs)
      {
      }
      
      /**
       * To replace accented characters in a String by unaccented equivalents.
       */
      bool next(Token* token);
};
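
/* Example (sketch; assumes the source encoding can represent the accented
 * TCHAR literals): stripping accents after whitespace tokenization.
 *
 *   CL_NS(util)::StringReader reader(_T("déjà vu"));
 *   TokenStream* ts = _CLNEW ISOLatin1AccentFilter(
 *       _CLNEW WhitespaceTokenizer(&reader), true);
 *   Token t;
 *   while (ts->next(&t))
 *       _tprintf(_T("%s\n"), t.termText());  // deja, vu
 *   ts->close();
 *   _CLDELETE(ts);
 */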


/**
 * Emits the entire input as a single token.
 */
class KeywordTokenizer: public Tokenizer {
private:
    LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256);
    bool done;
    int bufferSize;
public:
    KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1);
    virtual ~KeywordTokenizer();
    bool next(Token* token);
};

/**
 * "Tokenizes" the entire stream as a single token. This is useful
 * for data like zip codes, ids, and some product names.
 */
class KeywordAnalyzer: public Analyzer {
public:
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
    virtual ~KeywordAnalyzer(){}
};
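
/* Example (sketch): unlike the tokenizers above, KeywordAnalyzer emits the
 * input unsplit, which is what you want for ids, zip codes and the like.
 *
 *   CL_NS(util)::StringReader reader(_T("SKU 1234-AB"));
 *   KeywordAnalyzer analyzer;
 *   TokenStream* ts = analyzer.tokenStream(_T("sku"), &reader);
 *   Token t;
 *   if (ts->next(&t))
 *       _tprintf(_T("%s\n"), t.termText());  // "SKU 1234-AB", one token
 *   ts->close();
 *   _CLDELETE(ts);
 */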

    
/**
 * Removes words that are too long or too short from the stream.
 *
 */
class LengthFilter: public TokenFilter {
private:
    int _min;
    int _max;
public:
    /**
    * Build a filter that removes words that are too long or too
    * short from the text.
    */
    LengthFilter(TokenStream* in, int _min, int _max);
    
    /**
    * Returns the next input Token whose termText() is of the right length.
    */
    bool next(Token* token);
};
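
/* Example (sketch): keeping only tokens of 3 to 8 characters.  The filter is
 * stacked on a stack-allocated tokenizer here, since this constructor takes
 * no deleteTokenStream flag.
 *
 *   CL_NS(util)::StringReader reader(_T("a midsize extraordinarily word"));
 *   WhitespaceTokenizer tokenizer(&reader);
 *   LengthFilter filter(&tokenizer, 3, 8);
 *   Token t;
 *   while (filter.next(&t))
 *       _tprintf(_T("%s\n"), t.termText());  // midsize, word
 *   filter.close();
 */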


CL_NS_END
#endif
