// Source code listing from the qt4-x11 package.
//
// File: TermVector.h

/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_index_termvector_h
#define _lucene_index_termvector_h

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "FieldInfos.h"

CL_NS_DEF(index)

// Forward declarations for types that are referenced in this header before
// their full definitions appear further down:
//  - TermVectorOffsetInfo is used by TermVectorsWriter (Array<TermVectorOffsetInfo>*).
//  - TermPositionVector is returned by TermFreqVector::__asTermPositionVector().
struct TermVectorOffsetInfo;
class TermPositionVector; 

/** Provides access to stored term vector of 
 *  a document field.
 */
00025 class TermFreqVector:LUCENE_BASE {
public:
      virtual ~TermFreqVector(){
      }

      /**
      * 
      * @return The field this vector is associated with.
      * 
      */ 
      virtual const TCHAR* getField() = 0;

      /** 
      * @return The number of terms in the term vector.
      */
      virtual int32_t size() = 0;

      /** 
      * @return An Array of term texts in ascending order.
      */
      virtual const TCHAR** getTerms() = 0;


      /** Array of term frequencies. Locations of the array correspond one to one
      *  to the terms in the array obtained from <code>getTerms</code>
      *  method. Each location in the array contains the number of times this
      *  term occurs in the document or the document field.
      *
      *  The size of the returned array is size()
      *  @memory Returning a pointer to internal data. Do not delete.
      */
      virtual const Array<int32_t>* getTermFrequencies() = 0;


      /** Return an index in the term numbers array returned from
      *  <code>getTerms</code> at which the term with the specified
      *  <code>term</code> appears. If this term does not appear in the array,
      *  return -1.
      */
      virtual int32_t indexOf(const TCHAR* term) = 0;


      /** Just like <code>indexOf(int32_t)</code> but searches for a number of terms
      *  at the same time. Returns an array that has the same size as the number
      *  of terms searched for, each slot containing the result of searching for
      *  that term number.
      *
      *  @param terms array containing terms to look for
      *  @param start index in the array where the list of terms starts
      *  @param len the number of terms in the list
      */
      virtual void indexesOf(const TCHAR** terms, const int32_t start, const int32_t len, Array<int32_t>& ret) = 0;

      /** Solve the diamond inheritence problem by providing a reinterpret function.
    * No dynamic casting is required and no RTTI data is needed to do this
    */
      virtual TermPositionVector* __asTermPositionVector()=0;
};


/**
* Writer works by opening a document and then opening the fields within the document and then
* writing out the vectors for each field.
* 
* Rough usage:
*
<CODE>
for each document
{
writer.openDocument();
for each field on the document
{
writer.openField(field);
for all of the terms
{
writer.addTerm(...)
}
writer.closeField
}
writer.closeDocument()    
}
</CODE>
*/
00108 class TermVectorsWriter:LUCENE_BASE {
private:
      class TVField:LUCENE_BASE {
      public:
            int32_t number;
            int64_t tvfPointer;
            int32_t length;   // number of distinct term positions
            bool storePositions;
            bool storeOffsets;
       
            TVField(int32_t number, bool storePos, bool storeOff): 
                        tvfPointer(0),length(0){
                  this->number = number;
                  this->storePositions = storePos;
                  this->storeOffsets = storeOff;
            }
        ~TVField(){}
      };

      class TVTerm:LUCENE_BASE {
            const TCHAR* termText;
            int32_t termTextLen; //textlen cache
            
      public:
            TVTerm();
        ~TVTerm();
            
            int32_t freq;
            Array<int32_t>* positions;
            Array<TermVectorOffsetInfo>* offsets;

            const TCHAR* getTermText() const;
            size_t getTermTextLen();
            void setTermText(const TCHAR* val);
      };


      CL_NS(store)::IndexOutput* tvx, *tvd, *tvf;
      CL_NS(util)::CLVector<TVField*,CL_NS(util)::Deletor::Object<TVField> > fields;
      CL_NS(util)::CLVector<TVTerm*,CL_NS(util)::Deletor::Object<TVTerm> > terms;
      FieldInfos* fieldInfos;

      TVField* currentField;
      int64_t currentDocPointer;

      void addTermInternal(const TCHAR* termText, const int32_t freq, 
            Array<int32_t>* positions, Array<TermVectorOffsetInfo>* offsets);

      void writeField();
      void writeDoc();
  
      void openField(int32_t fieldNumber, bool storePositionWithTermVector, 
      bool storeOffsetWithTermVector);
public:
      LUCENE_STATIC_CONSTANT(int32_t, FORMAT_VERSION = 2);

      //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file 
      LUCENE_STATIC_CONSTANT(int32_t, FORMAT_SIZE = 4);

      LUCENE_STATIC_CONSTANT(uint8_t, STORE_POSITIONS_WITH_TERMVECTOR = 0x1);
      LUCENE_STATIC_CONSTANT(uint8_t, STORE_OFFSET_WITH_TERMVECTOR = 0x2);
      
      static const char* LUCENE_TVX_EXTENSION;
      static const char* LUCENE_TVD_EXTENSION;
      static const char* LUCENE_TVF_EXTENSION;

      TermVectorsWriter(CL_NS(store)::Directory* directory, const char* segment,
                                       FieldInfos* fieldInfos);

      ~TermVectorsWriter();
      void openDocument();
      void closeDocument();

      /** Close all streams. */
      void close();
      bool isDocumentOpen() const;

      /** Start processing a field. This can be followed by a number of calls to
      *  addTerm, and a final call to closeField to indicate the end of
      *  processing of this field. If a field was previously open, it is
      *  closed automatically.
      */
      void openField(const TCHAR* field);

      /** Finished processing current field. This should be followed by a call to
      *  openField before future calls to addTerm.
      */
      void closeField();

      /** Return true if a field is currently open. */
      bool isFieldOpen() const;

      /**
      * Add a complete document specified by all its term vectors. If document has no
      * term vectors, add value for tvx.
      * 
      * @param vectors
      * @throws IOException
      */
      void addAllDocVectors(Array<TermFreqVector*>& vectors);

      /** Add term to the field's term vector. Field must already be open.
      *  Terms should be added in
      *  increasing order of terms, one call per unique termNum. ProxPointer
      *  is a pointer into the TermPosition file (prx). Freq is the number of
      *  times this term appears in this field, in this document.
      * @throws IllegalStateException if document or field is not open
      */
      void addTerm(const TCHAR* termText, int32_t freq, 
            Array<int32_t>* positions = NULL, Array<TermVectorOffsetInfo>* offsets = NULL);
};

/**
 * Concrete TermFreqVector built from a field name, an array of term texts
 * and an array of term frequencies (see the constructor).
 */
class SegmentTermVector: public virtual TermFreqVector {
private:
      const TCHAR* field;
      TCHAR** terms;
      int32_t termsLen; // cached number of entries in terms - NOTE(review): confirm where it is computed
      Array<int32_t>* termFreqs;

      // NOTE(review): presumably looks up key in the array a of arraylen entries
      // and backs indexOf - confirm the not-found return value in the .cpp
      int32_t binarySearch(TCHAR** a, const int32_t arraylen, const TCHAR* key) const;
public:
      //note: termFreqs must be the same length as terms
      SegmentTermVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs);
      virtual ~SegmentTermVector();

      /**
      * @return The field this vector is associated with (returned as text,
      *         not as a field number)
      */
      const TCHAR* getField();
      TCHAR* toString() const;
      int32_t size();
      const TCHAR** getTerms();
      const Array<int32_t>* getTermFrequencies();
      int32_t indexOf(const TCHAR* termText);
      void indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array<int32_t>& ret);

      virtual TermPositionVector* __asTermPositionVector();
};



/**
* @version $Id:
*/
00255 class TermVectorsReader:LUCENE_BASE {
private:
    FieldInfos* fieldInfos;
    
    CL_NS(store)::IndexInput* tvx;
    CL_NS(store)::IndexInput* tvd;
    CL_NS(store)::IndexInput* tvf;
    int64_t _size;
    
    int32_t tvdFormat;
    int32_t tvfFormat;
    
    
    int32_t checkValidFormat(CL_NS(store)::IndexInput* in);
    
      void readTermVectors(const TCHAR** fields, const int64_t* tvfPointers, const int32_t len, Array<TermFreqVector*>& _return);

    /**
    * 
    * @param field The field to read in
    * @param tvfPointer The pointer within the tvf file where we should start reading
    * @return The TermVector located at that position
    * @throws IOException
    */
    SegmentTermVector* readTermVector(const TCHAR* field, const int64_t tvfPointer);

    int64_t size();
  
  
      DEFINE_MUTEX(THIS_LOCK)
      TermVectorsReader(const TermVectorsReader& copy);
public:
      TermVectorsReader(CL_NS(store)::Directory* d, const char* segment, FieldInfos* fieldInfos);
      ~TermVectorsReader();

      void close();
      TermVectorsReader* clone() const;

      /**
      * Retrieve the term vector for the given document and field
      * @param docNum The document number to retrieve the vector for
      * @param field The field within the document to retrieve
      * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
      * @throws IOException if there is an error reading the term vector files
      */ 
      TermFreqVector* get(const int32_t docNum, const TCHAR* field);


      /**
      * Return all term vectors stored for this document or null if the could not be read in.
      * 
      * @param docNum The document number to retrieve the vector for
      * @return All term frequency vectors
      * @throws IOException if there is an error reading the term vector files 
      */
      bool get(int32_t docNum, Array<TermFreqVector*>& result);
};


/** Holds a start offset and an end offset, with accessors for each. */
struct TermVectorOffsetInfo {
    // Declared as plain int, while the accessors below use int32_t.
    int startOffset;
    int endOffset;
public:
      // Shared static array; NOTE(review): presumably used as the "empty list" of offsets - confirm usage.
      static Array<TermVectorOffsetInfo> EMPTY_OFFSET_INFO;
    TermVectorOffsetInfo();
    ~TermVectorOffsetInfo();
    TermVectorOffsetInfo(int32_t startOffset, int32_t endOffset);
    int32_t getEndOffset() const;
    void setEndOffset(int32_t endOffset);
    int32_t getStartOffset() const;
    void setStartOffset(int32_t startOffset);
    // Takes a pointer rather than a reference; NOTE(review): NULL handling is not visible here.
    bool equals(TermVectorOffsetInfo* o);
    size_t hashCode() const;
};


/** Extends <code>TermFreqVector</code> to provide additional information about
 *  positions in which each of the terms is found. A TermPositionVector not necessarily
 * contains both positions and offsets, but at least one of these arrays exists.
 */
00335 class TermPositionVector: public virtual TermFreqVector {
public:

    /** Returns an array of positions in which the term is found.
     *  Terms are identified by the index at which its number appears in the
     *  term String array obtained from the <code>indexOf</code> method.
     *  May return null if positions have not been stored.
     */
    virtual Array<int32_t>* getTermPositions(int32_t index) = 0;
  
    /**
     * Returns an array of TermVectorOffsetInfo in which the term is found.
     * May return null if offsets have not been stored.
     * 
     * @see org.apache.lucene.analysis.Token
     * 
     * @param index The position in the array to get the offsets from
     * @return An array of TermVectorOffsetInfo objects or the empty list
     */ 
     virtual Array<TermVectorOffsetInfo>* getOffsets(int32_t index) = 0;
     
     virtual ~TermPositionVector(){
       }
};


class SegmentTermPositionVector: public SegmentTermVector, public TermPositionVector {
protected:
      Array< Array<int32_t> >* positions;
      Array< Array<TermVectorOffsetInfo> >* offsets;
      static Array<int32_t> EMPTY_TERM_POS;
public:
      SegmentTermPositionVector(
            const TCHAR* field,
            TCHAR** terms,
            Array<int32_t>* termFreqs,
            Array< Array<int32_t> >* positions,
            Array< Array<TermVectorOffsetInfo> >* offsets);
      ~SegmentTermPositionVector();

      /**
      * Offsets at which the term with the given index is found.
      *
      * @see org.apache.lucene.analysis.Token
      * @param index The position in the array to get the offsets from
      * @return An array of TermVectorOffsetInfo objects or the empty list
      */
      Array<TermVectorOffsetInfo>* getOffsets(int32_t index);

      /**
      * Positions at which the term with the given index is found.
      * A term's index is the one reported for it by <code>indexOf</code>.
      */
      Array<int32_t>* getTermPositions(int32_t index);

      // The members below forward explicitly to the SegmentTermVector
      // implementations, since this class inherits from both
      // SegmentTermVector and TermPositionVector.
      const TCHAR* getField()
      {
            return SegmentTermVector::getField();
      }

      TCHAR* toString() const
      {
            return SegmentTermVector::toString();
      }

      int32_t size()
      {
            return SegmentTermVector::size();
      }

      const TCHAR** getTerms()
      {
            return SegmentTermVector::getTerms();
      }

      const Array<int32_t>* getTermFrequencies()
      {
            return SegmentTermVector::getTermFrequencies();
      }

      int32_t indexOf(const TCHAR* termText)
      {
            return SegmentTermVector::indexOf(termText);
      }

      void indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array<int32_t>& ret)
      {
            SegmentTermVector::indexesOf(termNumbers, start, len, ret);
      }

      virtual TermPositionVector* __asTermPositionVector();
};

CL_NS_END
#endif

// Listing generated by Doxygen 1.6.0.