// Source listing: qt4-x11 package
//
// File: SegmentHeader.h

/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_index_SegmentHeader_
#define _lucene_index_SegmentHeader_

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "SegmentInfos.h"
#include "CLucene/util/BitSet.h"
#include "CLucene/util/VoidMap.h"
#include "Term.h"
#include "FieldInfos.h"
#include "FieldsReader.h"
#include "IndexReader.h"
#include "TermInfosReader.h"
#include "CompoundFile.h"
#include "CLucene/util/ThreadLocal.h"

CL_NS_DEF(index)
class SegmentReader;

/**
* TermDocs implementation that is constructed from a SegmentReader.
* Only the interface is declared here; every non-trivial member function is
* defined outside this header.
* NOTE(review): presumably enumerates the documents (and term frequencies) of
* one term inside a single segment - confirm against the implementation file.
*/
class SegmentTermDocs:public virtual TermDocs {
      // Private iteration/skip bookkeeping. The exact meaning of each field is
      // established by the implementation, which is not visible in this header.
      int32_t _doc;
      int32_t skipInterval;
      int32_t numSkips;
      int32_t skipCount;
      CL_NS(store)::IndexInput* skipStream;
      int32_t skipDoc;
      int64_t freqPointer;
      int64_t proxPointer;
      int64_t skipPointer;
      bool haveSkipped;
protected:
      // The SegmentReader handed to the constructor
      const SegmentReader* parent;
      // State shared with derived classes (SegmentTermPositions derives from this class).
      // NOTE(review): ownership of freqStream and deletedDocs is not visible here - confirm.
      CL_NS(store)::IndexInput* freqStream;
      int32_t count;
      int32_t df;
      int32_t _freq;
      CL_NS(util)::BitSet* deletedDocs;
public:
    virtual ~SegmentTermDocs();

    // Three seek overloads: by term enumeration, by term, or by term info.
    virtual void seek(TermEnum* termEnum);
      virtual void seek(Term* term);
      virtual void seek(const TermInfo* ti);

      virtual void close();
      // NOTE(review): presumably the current document number and the frequency
      // of the term in that document (cf. the _doc / _freq members) - confirm.
      virtual int32_t doc()const;
      virtual int32_t freq()const;

      virtual bool next();

      /** Optimized implementation. */
      virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length);

      /** Optimized implementation. */
      virtual bool skipTo(const int32_t target);

      // Conversion helper to the TermPositions interface; what a plain
      // SegmentTermDocs returns is decided in the implementation file.
      virtual TermPositions* __asTermPositions();
      
      ///\param Parent must be a segment reader
      SegmentTermDocs( const SegmentReader* Parent);
protected:
      // Hooks with empty default bodies; SegmentTermPositions redeclares both.
      virtual void skippingDoc(){}
      virtual void skipProx(int64_t proxPointer){}
};


/**
* Combines SegmentTermDocs with the TermPositions interface and adds state for
* a prox stream (proxStream, proxCount, position).
* Only the forwarding overloads below are defined inline; everything else is
* defined outside this header.
*/
class SegmentTermPositions: public SegmentTermDocs, public TermPositions {
private:
      // NOTE(review): presumably the input over the positions (.prx) data plus
      // the remaining-position counter and last position - confirm in the implementation.
      CL_NS(store)::IndexInput* proxStream;
      int32_t proxCount;
      int32_t position;

public:
      ///\param Parent must be a segment reader
      SegmentTermPositions(const SegmentReader* Parent);
      ~SegmentTermPositions();

    // Redeclarations of SegmentTermDocs members (same signatures as in the base).
    void seek(const TermInfo* ti);
      void close();
      int32_t nextPosition();
      bool next();
      int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
      // Conversion helpers between the two interfaces this class inherits.
      virtual TermDocs* __asTermDocs();
      virtual TermPositions* __asTermPositions();

    //resolve SegmentTermDocs/TermPositions ambiguity:
    //each of these simply forwards to the SegmentTermDocs implementation
      void seek(Term* term){ SegmentTermDocs::seek(term); }
    void seek(TermEnum* termEnum){ SegmentTermDocs::seek(termEnum); }
    int32_t doc() const{ return SegmentTermDocs::doc(); }
      int32_t freq() const{ return SegmentTermDocs::freq(); }
      bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); }

protected:
      // Replaces the empty hook declared in SegmentTermDocs.
      void skippingDoc();
      /** Called by super.skipTo(). */
    void skipProx(int64_t proxPointer);
};




/**
* An IndexReader responsible for reading 1 segment of an index
*/
00114 class SegmentReader: public IndexReader{
      /**
      * The class Norm represents the normalizations for a field.
      * These normalizations are read from an IndexInput in into an array of bytes called bytes
      */
00119       class Norm :LUCENE_BASE{
            int32_t number;
            SegmentReader* reader;
00122             const char* segment; ///< pointer to segment name
      public:
            CL_NS(store)::IndexInput* in;
            uint8_t* bytes;
            bool dirty;
            //Constructor
            Norm(CL_NS(store)::IndexInput* instrm, int32_t number, SegmentReader* reader, const char* segment);
            //Destructor
            ~Norm();

            void reWrite();
      };
      friend class SegmentReader::Norm;

      //Holds the name of the segment that is being read
      const char* segment;

      //Indicates if there are documents marked as deleted
      bool deletedDocsDirty;
      bool normsDirty;
      bool undeleteAll;

      //Holds all norms for all fields in the segment
      typedef CL_NS(util)::CLHashtable<const TCHAR*,Norm*,CL_NS(util)::Compare::TChar, CL_NS(util)::Equals::TChar> NormsType;
    NormsType _norms; 
    
      uint8_t* ones;
      uint8_t* fakeNorms();

      // Compound File Reader when based on a compound file segment
      CompoundFileReader* cfsReader;
      ///Reads the Field Info file
00154       FieldsReader* fieldsReader;
      TermVectorsReader* termVectorsReaderOrig;
      CL_NS(util)::ThreadLocal<TermVectorsReader*,
            CL_NS(util)::Deletor::Object<TermVectorsReader> >termVectorsLocal;

      void initialize(SegmentInfo* si);

      /**
      * Create a clone from the initial TermVectorsReader and store it in the ThreadLocal.
      * @return TermVectorsReader
      */
      TermVectorsReader* getTermVectorsReader();
      
protected:
      ///Marks document docNum as deleted
      void doDelete(const int32_t docNum);
      void doUndeleteAll();
      void doCommit();
      void doSetNorm(int32_t doc, const TCHAR* field, uint8_t value);

      // can return null if norms aren't stored
      uint8_t* getNorms(const TCHAR* field);
  
public:
      /**
      Func - Constructor.
      Opens all files of a segment
      .fnm     -> Field Info File
                        Field names are stored in the field info file, with suffix .fnm.
      .frq     -> Frequency File
                        The .frq file contains the lists of documents which contain 
                        each term, along with the frequency of the term in that document.
      .prx     -> Prox File
                        The prox file contains the lists of positions that each term occurs
                        at within documents.
      .tis     -> Term Info File
                        This file is sorted by Term. Terms are ordered first lexicographically 
                        by the term's field name, and within that lexicographically by the term's text.
      .del     -> Deletion File
                        The .del file is optional, and only exists when a segment contains deletions
      .f[0-9]* -> Norm File
                        Contains s, for each document, a byte that encodes a value that is 
                        multiplied into the score for hits on that field:
      */
      SegmentReader(SegmentInfo* si);

      SegmentReader(SegmentInfos* sis, SegmentInfo* si);
      ///Destructor.
      virtual ~SegmentReader();

      ///Closes all streams to the files of a single segment
      void doClose();

      ///Checks if a segment managed by SegmentInfo si has deletions
      static bool hasDeletions(const SegmentInfo* si);
    bool hasDeletions() const;
      bool hasNorms(const TCHAR* field) const;

      ///Returns all file names managed by this SegmentReader
      void files(CL_NS(util)::AStringArrayWithDeletor& retarray);
      ///Returns an enumeration of all the Terms and TermInfos in the set.
      TermEnum* terms() const;
      ///Returns an enumeration of terms starting at or after the named term t
      TermEnum* terms(const Term* t) const;

      ///Gets the document identified by n
      bool document(int32_t n, CL_NS(document)::Document* doc);

      ///Checks if the n-th document has been marked deleted
      bool isDeleted(const int32_t n);

      ///Returns an unpositioned TermDocs enumerator.
      TermDocs* termDocs() const;
      ///Returns an unpositioned TermPositions enumerator.
      TermPositions* termPositions() const;

      ///Returns the number of documents which contain the term t
      int32_t docFreq(const Term* t) const;

      ///Returns the actual number of documents in the segment
      int32_t numDocs();
      ///Returns the number of  all the documents in the segment including the ones that have
      ///been marked deleted
      int32_t maxDoc() const;

    ///Returns the bytes array that holds the norms of a named field.
      ///Returns fake norms if norms aren't available
    uint8_t* norms(const TCHAR* field);
      
    ///Reads the Norms for field from disk
      void norms(const TCHAR* field, uint8_t* bytes);
      
      ///concatenating segment with ext and x
      char* SegmentName(const char* ext, const int32_t x=-1);
    ///Creates a filename in buffer by concatenating segment with ext and x
      void SegmentName(char* buffer,int32_t bufferLen,const char* ext, const int32_t x=-1 );

      /**
      * @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption)
      */
      void getFieldNames(FieldOption fldOption, CL_NS(util)::StringArrayWithDeletor& retarray);
    
    static bool usesCompoundFile(SegmentInfo* si);

      /** Return a term frequency vector for the specified document and field. The
      *  vector returned contains term numbers and frequencies for all terms in
      *  the specified field of this document, if the field had storeTermVector
      *  flag set.  If the flag was not set, the method returns null.
      * @throws IOException
      */
    TermFreqVector* getTermFreqVector(int32_t docNumber, const TCHAR* field=NULL);

      /** Return an array of term frequency vectors for the specified document.
      *  The array contains a vector for each vectorized field in the document.
      *  Each vector vector contains term numbers and frequencies for all terms
      *  in a given vectorized field.
      *  If no such fields existed, the method returns null.
      * @throws IOException
      */
      bool getTermFreqVectors(int32_t docNumber, Array<TermFreqVector*>& result);
private:
      //Open all norms files for all fields
      void openNorms(CL_NS(store)::Directory* cfsDir);
      //Closes all norms files
      void closeNorms();
      
      ///a bitVector that manages which documents have been deleted
      CL_NS(util)::BitSet* deletedDocs;
      ///an IndexInput to the frequency file
00283       CL_NS(store)::IndexInput* freqStream;
      ///For reading the fieldInfos file
      FieldInfos* fieldInfos;
    ///For reading the Term Dictionary .tis file
00287       TermInfosReader* tis;
      ///an IndexInput to the prox file
      CL_NS(store)::IndexInput* proxStream;\

    static bool hasSeparateNorms(SegmentInfo* si);
      static uint8_t* createFakeNorms(int32_t size);

    //allow various classes to access the internals of this. this allows us to have
    //a more tight idea of the package
    friend class IndexReader;
    friend class IndexWriter;
    friend class SegmentTermDocs;
    friend class SegmentTermPositions;
    friend class MultiReader;
};

CL_NS_END
#endif

// Generated by Doxygen 1.6.0