// Listing header (code-search page): package search; source code of qt4-x11, file versions.
//
// File: TermVectorWriter.cpp

/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "TermVector.h"
#include "CLucene/util/Misc.h"

#include <string>

CL_NS_USE(util)
CL_NS_DEF(index)

// File name extensions appended to the segment name by the constructor.
// In this file, tvx receives one document pointer per document (writeDoc),
// tvd receives the per-document field numbers and field pointers (writeDoc),
// and tvf receives the per-field term data (writeField).
const char* TermVectorsWriter::LUCENE_TVX_EXTENSION = ".tvx";
const char* TermVectorsWriter::LUCENE_TVD_EXTENSION = ".tvd";
const char* TermVectorsWriter::LUCENE_TVF_EXTENSION = ".tvf";

 /**
  * Creates the three term vector output files of a segment and writes
  * FORMAT_VERSION at the start of each of them.
  *
  * Each file name is the segment name followed by LUCENE_TVX_EXTENSION,
  * LUCENE_TVD_EXTENSION or LUCENE_TVF_EXTENSION.
  *
  * @param directory  directory whose createOutput() creates the files
  * @param segment    segment name used as the common file name prefix
  * @param fieldInfos stored for later field-name lookups (see openField)
  */
 TermVectorsWriter::TermVectorsWriter(CL_NS(store)::Directory* directory, 
    const char* segment,FieldInfos* fieldInfos)
 {
    // Build the file names in a growable string rather than a fixed
    // CL_MAX_NAME stack buffer, so an over-long segment name cannot
    // overflow it.
    const std::string base(segment);

    // Open files for TermVector storage
    tvx = directory->createOutput((base + LUCENE_TVX_EXTENSION).c_str());
    tvx->writeInt(FORMAT_VERSION);

    tvd = directory->createOutput((base + LUCENE_TVD_EXTENSION).c_str());
    tvd->writeInt(FORMAT_VERSION);

    tvf = directory->createOutput((base + LUCENE_TVF_EXTENSION).c_str());
    tvf->writeInt(FORMAT_VERSION);

    this->fieldInfos = fieldInfos;

    // No field and no document are open initially.
    currentField = NULL;
    currentDocPointer = -1;
  }

  /**
   * Closes and releases every output stream that is still held.
   * Streams whose pointer is already NULL are skipped.
   */
  TermVectorsWriter::~TermVectorsWriter(){
      // tvx stream
      if ( tvx ){
          tvx->close();
          _CLDELETE(tvx);
      }

      // tvd stream
      if ( tvd ){
          tvd->close();
          _CLDELETE(tvd);
      }

      // tvf stream
      if ( tvf ){
          tvf->close();
          _CLDELETE(tvf);
      }
  }


  void TermVectorsWriter::openDocument() {
    closeDocument();

    currentDocPointer = tvd->getFilePointer();
  }


  /**
   * Finishes the currently open document, if any: closes the open field,
   * writes the document record, clears the collected fields and marks the
   * writer as having no open document. Does nothing when no document is open.
   */
  void TermVectorsWriter::closeDocument(){
    if (!isDocumentOpen())
      return;

    closeField();
    writeDoc();
    fields.clear();
    currentDocPointer = -1;   // -1 means "no document open"
  }


  /** @return true while a document is open, i.e. currentDocPointer is not -1. */
  bool TermVectorsWriter::isDocumentOpen() const{
    return currentDocPointer != -1;
  }


  /**
   * Starts a new field inside the open document. A field that is still open
   * is closed first.
   *
   * @param fieldNumber                 number of the field to open
   * @param storePositionWithTermVector passed on to the new TVField
   * @param storeOffsetWithTermVector   passed on to the new TVField
   * @throws CL_ERR_InvalidState when no document is open
   */
  void TermVectorsWriter::openField(int32_t fieldNumber, bool storePositionWithTermVector, bool storeOffsetWithTermVector){
    const bool documentOpen = isDocumentOpen();
    if (!documentOpen) {
      _CLTHROWA(CL_ERR_InvalidState,"Cannot open field when no document is open.");
    }

    closeField();
    currentField = _CLNEW TVField(fieldNumber, storePositionWithTermVector, storeOffsetWithTermVector);
  }
00089   void TermVectorsWriter::openField(const TCHAR* field) {
    FieldInfo* fieldInfo = fieldInfos->fieldInfo(field);
    openField(fieldInfo->number, fieldInfo->storePositionWithTermVector, fieldInfo->storeOffsetWithTermVector);
  }

00094   void TermVectorsWriter::closeField(){
    if (isFieldOpen()) {
      /* DEBUG */
      //System.out.println("closeField()");
      /* DEBUG */

      // save field and terms
      writeField();
      fields.push_back(currentField);
      terms.clear();
      currentField = NULL;
    }
  }

00108   bool TermVectorsWriter::isFieldOpen() const{
    return currentField != NULL;
  }

00112   void TermVectorsWriter::addTerm(const TCHAR* termText, int32_t freq, 
        Array<int32_t>* positions, Array<TermVectorOffsetInfo>* offsets) {
    if (!isDocumentOpen()) 
            _CLTHROWA(CL_ERR_InvalidState,"Cannot add terms when document is not open");
    if (!isFieldOpen())
            _CLTHROWA(CL_ERR_InvalidState,"Cannot add terms when field is not open");

    addTermInternal(termText, freq, positions, offsets);
  }

  void TermVectorsWriter::addTermInternal(const TCHAR* termText, int32_t freq,
        Array<int32_t>* positions, Array<TermVectorOffsetInfo>* offsets) {
    TVTerm* term = _CLNEW TVTerm();
    term->setTermText(termText);
    term->freq = freq;
    term->positions = positions;
    term->offsets = offsets;
    terms.push_back(term);
  }

00132   void TermVectorsWriter::addAllDocVectors(Array<TermFreqVector*>& vectors){
      openDocument();

      for (int32_t i = 0; i < vectors.length; ++i) {
            bool storePositionWithTermVector = false;
            bool storeOffsetWithTermVector = false;

            if ( vectors[i]->__asTermPositionVector() != NULL ) {
                  TermPositionVector* tpVector = vectors[i]->__asTermPositionVector();

                  if (tpVector->size() > 0 && tpVector->getTermPositions(0) != NULL)
                        storePositionWithTermVector = true;
                  if (tpVector->size() > 0 && tpVector->getOffsets(0) != NULL)
                        storeOffsetWithTermVector = true;

                  FieldInfo* fieldInfo = fieldInfos->fieldInfo(tpVector->getField());
                  openField(fieldInfo->number, storePositionWithTermVector, storeOffsetWithTermVector);

                  for (int32_t j = 0; j < tpVector->size(); ++j)
                        addTermInternal(tpVector->getTerms()[j], 
                              (*tpVector->getTermFrequencies())[j], 
                              tpVector->getTermPositions(j),
                              tpVector->getOffsets(j));

                  closeField();

            } else {
                  TermFreqVector* tfVector = vectors[i];

                  FieldInfo* fieldInfo = fieldInfos->fieldInfo(tfVector->getField());
                  openField(fieldInfo->number, storePositionWithTermVector, storeOffsetWithTermVector);

                  for (int32_t j = 0; j < tfVector->size(); ++j)
                        addTermInternal(tfVector->getTerms()[j], 
                              (*tfVector->getTermFrequencies())[j], NULL, NULL);

                  closeField();
            }
      }

    closeDocument();
  }


00176   void TermVectorsWriter::close() {
    try {
      closeDocument();

      // make an effort to close all streams we can but remember and re-throw
      // the first exception encountered in this process
      #define _DOTVWCLOSE(x) if (x != NULL){ \
            try { \
              x->close(); _CLDELETE(x) \
        } catch (CLuceneError& e) { \
          if ( e.number() != CL_ERR_IO ) throw e; \
              if (ikeep==0)ikeep=e.number(); \
          if (keep[0]==0) strcpy(keep,e.what()); \
            } catch (...) { \
                  if (keep[0]==0) strcpy(keep,"Unknown error while closing " #x); \
            } \
        }
      }_CLFINALLY( \
    char keep[200]; \
        int32_t ikeep=0;
        keep[0]=0; \
        _DOTVWCLOSE(tvx); \
        _DOTVWCLOSE(tvd); \
        _DOTVWCLOSE(tvf); \
            if (keep[0] != 0 ) { \
                  _CLTHROWA(ikeep,keep); \
            }
      );
  }

  

  void TermVectorsWriter::writeField()  {
    // remember where this field is written
    currentField->tvfPointer = tvf->getFilePointer();
    //System.out.println("Field Pointer: " + currentField.tvfPointer);
    int32_t size = terms.size();

    tvf->writeVInt(size);

    bool storePositions = currentField->storePositions;
    bool storeOffsets = currentField->storeOffsets;
    uint8_t bits = 0x0;
    if (storePositions) 
      bits |= STORE_POSITIONS_WITH_TERMVECTOR;
    if (storeOffsets) 
      bits |= STORE_OFFSET_WITH_TERMVECTOR;
    tvf->writeByte(bits);

    const TCHAR* lastTermText = LUCENE_BLANK_STRING;
      int32_t lastTermTextLen = 0;

    for (int32_t i = 0; i < size; ++i) {
      TVTerm* term = terms[i];
        int32_t start = CL_NS(util)::Misc::stringDifference(lastTermText, lastTermTextLen, 
              term->getTermText(),term->getTermTextLen());
      int32_t length = term->getTermTextLen() - start;
      tvf->writeVInt(start);                // write shared prefix length
      tvf->writeVInt(length);               // write delta length
      tvf->writeChars(term->getTermText(), start, length);  // write delta chars
      tvf->writeVInt(term->freq);

      lastTermText = term->getTermText();
        lastTermTextLen = term->getTermTextLen();

        if(storePositions){
        if(term->positions == NULL)
                  _CLTHROWA(CL_ERR_IllegalState, "Trying to write positions that are NULL!");
        
        // use delta encoding for positions
        int32_t position = 0;
        for (int32_t j = 0; j < term->freq; ++j){
          tvf->writeVInt((*term->positions)[j] - position);
          position = (*term->positions)[j];
        }
      }
      
      if(storeOffsets){
        if(term->offsets == NULL)
          _CLTHROWA(CL_ERR_IllegalState, "Trying to write offsets that are NULL!");
        
        // use delta encoding for offsets
        int32_t position = 0;
        for (int32_t j = 0; j < term->freq; ++j) {
          tvf->writeVInt((*term->offsets)[j].getStartOffset() - position);
          tvf->writeVInt((*term->offsets)[j].getEndOffset() - (*term->offsets)[j].getStartOffset()); //Save the diff between the two.
          position = (*term->offsets)[j].getEndOffset();
        }
      }
    }
  }




  void TermVectorsWriter::writeDoc()  {
    if (isFieldOpen()) 
            _CLTHROWA(CL_ERR_InvalidState,"Field is still open while writing document");

      // write document index record
    tvx->writeLong(currentDocPointer);

    // write document data record
    int32_t size = fields.size();

    // write the number of fields
    tvd->writeVInt(size);

    // write field numbers
      for (int32_t j = 0; j < size; ++j) {
        tvd->writeVInt(fields[j]->number);
      }

    // write field pointers
    int64_t lastFieldPointer = 0;
      for (int32_t i = 0; i < size; ++i) {
        TVField* field = (TVField*) fields[i];
        tvd->writeVLong(field->tvfPointer - lastFieldPointer);

        lastFieldPointer = field->tvfPointer;
      }
  }


  /** @return the stored term text; NULL until setTermText() has been called. */
  const TCHAR* TermVectorsWriter::TVTerm::getTermText() const{
      return termText;
  }
    /**
     * Returns the length of the term text. The length is computed on first
     * use and cached; setTermText() resets the cache.
     * A term whose text was never set (NULL) reports length 0 instead of
     * measuring a NULL pointer.
     */
    size_t TermVectorsWriter::TVTerm::getTermTextLen(){ 
            // -1 marks "length not computed yet"
            if (termTextLen==-1)
                  termTextLen = termText ? _tcslen(termText) : 0;
            return termTextLen; 
      }
      void TermVectorsWriter::TVTerm::setTermText(const TCHAR* val){ 
        _CLDELETE_CARRAY(termText);
            termText = STRDUP_TtoT(val);
            termTextLen = -1;

      }
      /**
       * Creates an empty term: no text, text length not yet computed (-1),
       * frequency 0 and neither positions nor offsets.
       */
      TermVectorsWriter::TVTerm::TVTerm(): 
            termText(NULL),
            termTextLen(-1),
            freq(0),
            positions(NULL),
            offsets(NULL)
      {
      }
    /** Releases the term text held by this term. */
    TermVectorsWriter::TVTerm::~TVTerm(){ 
        _CLDELETE_CARRAY(termText)
    }

CL_NS_END

// Listing footer: generated by Doxygen 1.6.0 (page link: "Back to index").