// Source package: qt4-x11 (bundled CLucene)
// File: DocumentWriter.cpp

/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"

#include "DocumentWriter.h"
#include "FieldInfos.h"
#include "IndexWriter.h"
#include "FieldsWriter.h"
#include "Term.h"
#include "TermInfo.h"
#include "TermInfosWriter.h"

#include "CLucene/analysis/AnalysisHeader.h"

#include "CLucene/search/Similarity.h"
#include "TermInfosWriter.h"
#include "FieldsWriter.h"

CL_NS_USE(util)
CL_NS_USE(store)
CL_NS_USE(analysis)
CL_NS_USE(document)
CL_NS_DEF(index)

/*Posting*/

DocumentWriter::Posting::Posting(Term* t, const int32_t position, TermVectorOffsetInfo* offset)
{
//Func - Constructor
//Pre  - t contains a valid reference to a Term
//Post - Instance has been created with freq 1, one recorded position and,
//       if offset was supplied, one recorded term-vector offset
      freq = 1;
      
      term = _CL_POINTER(t);
      positions.values = _CL_NEWARRAY(int32_t,1);
      positions.values[0] = position;
      positions.length = 1;
      
      if ( offset != NULL ){
            this->offsets.values = _CL_NEWARRAY(TermVectorOffsetInfo,1);
            this->offsets.values[0] = *offset;
            //Fix: record the capacity. Without this, offsets.length stayed 0,
            //so addPosition's grow check (offsets.length == freq) never
            //matched and offsets[freq] was written past the end of the array.
            this->offsets.length = 1;
      }
}
DocumentWriter::Posting::~Posting(){
//Func - Destructor
//Pre  - true
//Post - The instance has been destroyed
 
      _CLDELETE_ARRAY(positions.values);
      //Fix: the offsets array allocated by the constructor / addPosition was
      //never released, leaking one TermVectorOffsetInfo array per posting
      //that stored offsets. _CLDELETE_ARRAY is NULL-safe, so postings
      //without offsets are unaffected.
      _CLDELETE_ARRAY(offsets.values);
      _CLDECDELETE(term);
}



DocumentWriter::DocumentWriter(Directory* d, Analyzer* a, CL_NS(search)::Similarity* sim, const int32_t mfl):
            analyzer(a),
            directory(d),
            maxFieldLength(mfl),
            fieldInfos(NULL),
            fieldLengths(NULL),
            similarity(sim),
            termIndexInterval( IndexWriter::DEFAULT_TERM_INDEX_INTERVAL ),
            fieldPositions(NULL),
            fieldBoosts(NULL),
            fieldOffsets(NULL),
            termBuffer(_CLNEW Term){
//Func - Constructor
//Pre  - d contains a valid reference to a Directory
//       a contains a valid reference to an Analyzer
//       mfl > 0 and contains the maximum field length
//Post - Instance has been created
//Fix  - fieldOffsets is now NULL-initialised in the member-init list;
//       previously it was left uninitialised, so the destructor's
//       unconditional _CLDELETE_ARRAY(fieldOffsets) freed a garbage
//       pointer whenever addDocument had never been called. The redundant
//       body re-assignments of fieldInfos/fieldLengths (already NULL in
//       the init list) were dropped.
 
CND_PRECONDITION(((mfl > 0) || (mfl == IndexWriter::FIELD_TRUNC_POLICY__WARN)),
 "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN")
}

DocumentWriter::DocumentWriter(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a, IndexWriter* writer):
            analyzer(a),
            directory(d),
            maxFieldLength(writer->getMaxFieldLength()),
            fieldInfos(NULL),
            fieldLengths(NULL),
            similarity(writer->getSimilarity()),
            termIndexInterval( writer->getTermIndexInterval() ),
            fieldPositions(NULL),
            fieldBoosts(NULL),
            fieldOffsets(NULL),
            termBuffer(_CLNEW Term){
//Func - Constructor taking the settings (max field length, similarity,
//       term index interval) from an existing IndexWriter
//Pre  - d contains a valid reference to a Directory
//       a contains a valid reference to an Analyzer
//       writer points to a valid IndexWriter
//Post - Instance has been created
//Fix  - fieldOffsets is now NULL-initialised in the member-init list;
//       previously it was left uninitialised, so the destructor's
//       unconditional _CLDELETE_ARRAY(fieldOffsets) freed a garbage
//       pointer whenever addDocument had never been called. The redundant
//       body re-assignments of fieldInfos/fieldLengths were dropped.
 
CND_PRECONDITION(((maxFieldLength > 0) || (maxFieldLength == IndexWriter::FIELD_TRUNC_POLICY__WARN)),
 "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN")
}

DocumentWriter::~DocumentWriter(){
//Func - Destructor
//Pre  - true
//Post - The instance has been destroyed: all postings, the field-info table,
//       the per-field bookkeeping arrays and the shared term buffer are freed
      clearPostingTable();
      _CLDELETE( fieldInfos );
      _CLDELETE_ARRAY(fieldLengths);
      _CLDELETE_ARRAY(fieldPositions);
      _CLDELETE_ARRAY(fieldBoosts);
      // NOTE(review): fieldOffsets is deleted unconditionally, but the
      // constructors do not NULL-initialise it; if addDocument was never
      // called this deletes an uninitialised pointer — confirm the
      // constructors set it to NULL.
      _CLDELETE_ARRAY(fieldOffsets);

      // termBuffer is reference-counted (see Posting ctor's _CL_POINTER use)
      _CLDECDELETE(termBuffer);
}

void DocumentWriter::clearPostingTable(){
      PostingTableType::iterator itr = postingTable.begin();
      while ( itr != postingTable.end() ){
            _CLDELETE(itr->second);
            _CLLDECDELETE(itr->first);

            ++itr;
      }
      postingTable.clear();
}

void DocumentWriter::addDocument(const char* segment, Document* doc) {
    CND_PRECONDITION(fieldInfos==NULL, "fieldInfos!=NULL")

      // write field names
      fieldInfos = _CLNEW FieldInfos();
      fieldInfos->add(doc);
      
      const char* buf = Misc::segmentname(segment, ".fnm");
      fieldInfos->write(directory, buf);
      _CLDELETE_CaARRAY(buf);

      // write field values
      FieldsWriter fieldsWriter(directory, segment, fieldInfos);
      try {
            fieldsWriter.addDocument(doc);
      } _CLFINALLY( fieldsWriter.close() );
      
      // invert doc into postingTable
      clearPostingTable();                  // clear postingTable
      fieldLengths = _CL_NEWARRAY(int32_t,fieldInfos->size());    // init fieldLengths
      fieldPositions = _CL_NEWARRAY(int32_t,fieldInfos->size());  // init fieldPositions
      fieldOffsets = _CL_NEWARRAY(int32_t,fieldInfos->size());    // init fieldOffsets
      
      //initialise fieldBoost array with default boost
      int32_t fbl = fieldInfos->size();
      float_t fbd = doc->getBoost();
      fieldBoosts = _CL_NEWARRAY(float_t,fbl);    // init fieldBoosts
      { //msvc6 scope fix
            for ( int32_t i=0;i<fbl;i++ )
                  fieldBoosts[i] = fbd;
      }

      { //msvc6 scope fix
            for ( int32_t i=0;i<fieldInfos->size();i++ )
                  fieldLengths[i] = 0;
      } //msvc6 scope fix
      invertDocument(doc);

      // sort postingTable into an array
      Posting** postings = NULL;
      int32_t postingsLength = 0;
      sortPostingTable(postings,postingsLength);

      //DEBUG:
      /*for (int32_t i = 0; i < postingsLength; i++) {
            Posting* posting = postings[i];
            
            TCHAR* b = posting->term->toString();
            _cout << b << " freq=" << posting->freq;
            _CLDELETE(b);

            _cout << " pos=" << posting->positions[0];
            for (int32_t j = 1; j < posting->freq; j++)
                  _cout <<"," << posting->positions[j];
            
            _cout << endl;
      }*/


      // write postings
      writePostings(postings,postingsLength, segment);

      // write norms of indexed fields
      writeNorms(segment);
      _CLDELETE_ARRAY( postings );
}

void DocumentWriter::sortPostingTable(Posting**& array, int32_t& arraySize) {
//Func - Copies the postings held by postingTable into a freshly allocated
//       array and sorts that array in ascending term order.
//Post - array points to a _CL_NEWARRAY of arraySize Posting pointers sorted
//       by term; the Posting objects remain owned by postingTable
      arraySize = postingTable.size();
      array = _CL_NEWARRAY(Posting*,arraySize);

      // flatten the table into the array
      int32_t count = 0;
      for ( PostingTableType::iterator itr = postingTable.begin();
            itr != postingTable.end(); ++itr ){
            array[count++] = itr->second;
      }

      // sort the array by term
      quickSort(array, 0, count - 1);
}


void DocumentWriter::invertDocument(const Document* doc) {
//Func - Inverts every indexed field of doc: tokenizes it (or takes its raw
//       value for un-tokenized fields) and records each term occurrence in
//       postingTable via addPosition.
//Pre  - addDocument has allocated fieldLengths, fieldPositions, fieldBoosts
//       and fieldOffsets for fieldInfos->size() fields
//Post - postingTable holds the document's postings; the per-field
//       length/position/boost/offset bookkeeping arrays are updated
      DocumentFieldEnumeration* fields = doc->fields();
      try {
            while (fields->hasMoreElements()) {
                  Field* field = (Field*)fields->nextElement();
                  const TCHAR* fieldName = field->name();
                const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName);
                
                // NOTE(review): position is seeded from fieldLengths, not
                // fieldPositions (which this "position in field" comment and
                // the Java original suggest) — verify before changing, since
                // fieldPositions may not be initialised by addDocument.
                int32_t length = fieldLengths[fieldNumber];     // length of field
                int32_t position = fieldLengths[fieldNumber]; // position in field
                // NOTE: despite the indentation, only the position increment
                // below is conditional; the offset initialisation always runs
                if (length>0) 
                  position+=analyzer->getPositionIncrementGap(fieldName);
                  int32_t offset = fieldOffsets[fieldNumber];       // offset field
        
                if (field->isIndexed()) {
                      if (!field->isTokenized()) { // un-tokenized field
                              //FEATURE: this is bug in java: if using a Reader, then
                              //field value will not be added. With CLucene, an untokenized
                              //field with a reader will still be added (if it isn't stored,
                              //because if it's stored, then the reader has already been read.
                              const TCHAR* charBuf = NULL;
                              int64_t dataLen = 0;

                              if (field->stringValue() == NULL && !field->isStored() ) {
                                    CL_NS(util)::Reader* r = field->readerValue();
                                    // this call tries to read the entire stream
                                    // this may invalidate the string for the further calls
                                    // it may be better to do this via a FilterReader
                                    // TODO make a better implementation of this
                                    dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE);
                                    if (dataLen == -1)
                                          dataLen = 0;
                                    //todo: would be better to pass the string length, in case
                                    //a null char is passed, but then would need to test the output too.
                              } else {
                                    charBuf = field->stringValue();
                                    dataLen = _tcslen(charBuf);
                              }
                              
                              // the whole field value is a single token; emit
                              // one posting, with offsets if requested
                              if(field->isStoreOffsetWithTermVector()){
                                    TermVectorOffsetInfo tio;
                                    tio.setStartOffset(offset);
                                    tio.setEndOffset(offset + dataLen);
                                    addPosition(fieldName, charBuf, position++, &tio );
                              }else
                                    addPosition(fieldName, charBuf, position++, NULL);
                              offset += dataLen;
                              length++;
                      } else { // field must be tokenized
                          CL_NS(util)::Reader* reader; // find or make Reader
                          bool delReader = false;
                          if (field->readerValue() != NULL) {
                            reader = field->readerValue();
                          } else if (field->stringValue() != NULL) {
                            reader = _CLNEW CL_NS(util)::StringReader(field->stringValue(),_tcslen(field->stringValue()),false);
                            delReader = true;
                          } else {
                            _CLTHROWA(CL_ERR_IO,"field must have either String or Reader value");
                          }
                  
                          try {
                            // Tokenize field and add to postingTable.
                            CL_NS(analysis)::TokenStream* stream = analyzer->tokenStream(fieldName, reader);
                  
                            try {
                              CL_NS(analysis)::Token t;
                              int32_t lastTokenEndOffset = -1;
                              while (stream->next(&t)) {
                                  // honour multi-position tokens (increment > 1)
                                  position += (t.getPositionIncrement() - 1);
                                  
                                  if(field->isStoreOffsetWithTermVector()){
                                    TermVectorOffsetInfo tio;
                                    // token offsets are relative to this field
                                    // value; shift by the field's start offset
                                    tio.setStartOffset(offset + t.startOffset());
                                    tio.setEndOffset(offset + t.endOffset());
                                                addPosition(fieldName, t.termText(), position++, &tio);
                                          }else
                                                addPosition(fieldName, t.termText(), position++, NULL);
                                          
                                          lastTokenEndOffset = t.endOffset();
                                  length++;
                                  // Apply field truncation policy.
                                          if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) {
                                    // The client programmer has explicitly authorized us to
                                    // truncate the token stream after maxFieldLength tokens.
                                    if ( length > maxFieldLength) {
                                      break;
                                    }
                                          } else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) {
                                    const TCHAR* errMsgBase = 
                                      _T("Indexing a huge number of tokens from a single")
                                      _T(" field (\"%s\", in this case) can cause CLucene")
                                      _T(" to use memory excessively.")
                                      _T("  By default, CLucene will accept only %s tokens")
                                      _T(" tokens from a single field before forcing the")
                                      _T(" client programmer to specify a threshold at")
                                      _T(" which to truncate the token stream.")
                                      _T("  You should set this threshold via")
                                                      _T(" IndexReader::maxFieldLength (set to LUCENE_INT32_MAX")
                                      _T(" to disable truncation, or a value to specify maximum number of fields).");
                                    
                                    TCHAR defaultMaxAsChar[34];
                                    _i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH,
                                        defaultMaxAsChar, 10
                                      );
                                                   int32_t errMsgLen = _tcslen(errMsgBase)
                                        + _tcslen(fieldName)
                                        + _tcslen(defaultMaxAsChar);
                                    TCHAR* errMsg = _CL_NEWARRAY(TCHAR,errMsgLen+1);
                  
                                    _sntprintf(errMsg, errMsgLen,errMsgBase, fieldName, defaultMaxAsChar);
                  
                                            _CLTHROWT_DEL(CL_ERR_Runtime,errMsg);
                                  }
                              } // while token->next
                                    
                                    // advance the field's running offset past the
                                    // last token we actually saw
                                    if(lastTokenEndOffset != -1 )
                                    offset += lastTokenEndOffset + 1;
                            } _CLFINALLY (
                              stream->close();
                              _CLDELETE(stream);
                            );
                          } _CLFINALLY (
                            if (delReader) {
                              _CLDELETE(reader);
                            }
                          );
                      } // if/else field is to be tokenized
                        // persist the per-field bookkeeping for this field
                        fieldLengths[fieldNumber] = length; // save field length
                        fieldPositions[fieldNumber] = position;     // save field position
                        fieldBoosts[fieldNumber] *= field->getBoost();
                        fieldOffsets[fieldNumber] = offset;
                  } // if field is to be indexed
            } // while more fields available
      } _CLFINALLY (
        _CLDELETE(fields);
      );
} // DocumentWriter::invertDocument


void DocumentWriter::addPosition(const TCHAR* field,
                                     const TCHAR* text,
                                     const int32_t position, 
                                     TermVectorOffsetInfo* offset) {
//Func - Records one occurrence of the term (field,text) at the given
//       position, optionally with term-vector offset information.
//Pre  - field and text are valid strings; offset may be NULL
//Post - The term's posting in postingTable has been created or updated
    
      // reuse the shared buffer to avoid allocating a Term per lookup
      termBuffer->set(field,text,false);

      Posting* ti = postingTable.get(termBuffer);
      if (ti != NULL) {                     // word seen before
            int32_t freq = ti->freq;
            if (ti->positions.length == freq) {   // positions array is full
                  int32_t* positions = ti->positions.values;
                  int32_t* newPositions = _CL_NEWARRAY(int32_t,freq * 2);       // double size
                  memcpy(newPositions,positions, freq * sizeof(int32_t)); // copy old positions to new
                  _CLDELETE_ARRAY( ti->positions.values );
                  ti->positions.values = newPositions;
                  ti->positions.length = freq*2;
            }
            ti->positions.values[freq] = position;            // add new position
            
            if (offset != NULL) {
                  if (ti->offsets.length == freq){      // offsets array is full
                        TermVectorOffsetInfo* offsets = ti->offsets.values;
                        TermVectorOffsetInfo* newOffsets = _CL_NEWARRAY(TermVectorOffsetInfo,freq*2);
                        memcpy(newOffsets,offsets,freq * sizeof(TermVectorOffsetInfo));
                        ti->offsets.values = newOffsets;
                        //Fix: record the doubled capacity, mirroring the
                        //positions branch above. Previously offsets.length was
                        //left stale, so the next grow check failed and
                        //offsets[freq] was written past the end of the array.
                        ti->offsets.length = freq*2;
                        _CLDELETE_ARRAY(offsets);
                  }
                  ti->offsets[freq] = *offset;
            }

            ti->freq = freq + 1;                  // update frequency
      } else {                              // word not seen before
            Term* term = _CLNEW Term( field, text, false);
            postingTable.put(term, _CLNEW Posting(term, position, offset));
      }
}

//static
void DocumentWriter::quickSort(Posting**& postings, const int32_t lo, const int32_t hi) {
//Func - Recursively sorts postings[lo..hi] into ascending term order using
//       quicksort with a median-of-three style pivot selection.
//Pre  - postings contains at least hi+1 valid Posting pointers
//Post - postings[lo..hi] is ordered by Term::compareTo
      if(lo >= hi)
            return;

      int32_t mid = (lo + hi) / 2;

      // order lo/mid/hi so that postings[mid] is a median-ish pivot; this
      // also guarantees sentinels for the partition scans below
      if(postings[lo]->term->compareTo(postings[mid]->term) > 0) {
             Posting* tmp = postings[lo];
            postings[lo] = postings[mid];
            postings[mid] = tmp;
      }

      if(postings[mid]->term->compareTo(postings[hi]->term) > 0) {
            Posting* tmp = postings[mid];
            postings[mid] = postings[hi];
            postings[hi] = tmp;
            
            if(postings[lo]->term->compareTo(postings[mid]->term) > 0) {
                  Posting* tmp2 = postings[lo];
                  postings[lo] = postings[mid];
                  postings[mid] = tmp2;
            }
      }

      int32_t left = lo + 1;
      int32_t right = hi - 1;

      // with three or fewer elements the median-of-three pass above already
      // sorted the range
      if (left >= right)
            return; 

      const Term* partition = postings[mid]->term; //not kept, so no need to finalize
    
      // partition: move elements <= pivot left of `left`, > pivot right of it
      for( ;; ) {
            while(postings[right]->term->compareTo(partition) > 0)
            --right;
            
            while(left < right && postings[left]->term->compareTo(partition) <= 0)
                  ++left;
                  
            if(left < right) {
                  Posting* tmp = postings[left];
                  postings[left] = postings[right];
                  postings[right] = tmp;
                  --right;
            } else {
                  break;
            }
      }

      // recurse on both halves; `left` is the last index of the low half
      quickSort(postings, lo, left);
      quickSort(postings, left + 1, hi);
}

void DocumentWriter::writePostings(Posting** postings, const int32_t postingsLength, const char* segment){
//Func - Writes the sorted postings to the segment's .frq and .prx files and
//       the term dictionary, and emits term vectors for fields that store
//       them.
//Pre  - postings contains postingsLength Posting pointers sorted by term
//       (as produced by sortPostingTable)
//Post - The .frq, .prx and term-dictionary files for segment exist in the
//       directory; term vectors were written where fieldInfos requests them.
//       All output streams are closed even on error; the first error seen
//       while closing is re-thrown.
      #define __DOCLOSE(obj) if(obj!=NULL){ try{ obj->close(); _CLDELETE(obj);} catch(CLuceneError &e){ierr=e.number();err=e.what();} catch(...){err="Unknown error while closing posting tables";} }
      IndexOutput* freq = NULL;
      IndexOutput* prox = NULL;
      TermInfosWriter* tis = NULL;
      TermVectorsWriter* termVectorWriter = NULL;
      try {
            //open files for inverse index storage
            const char* buf = Misc::segmentname( segment, ".frq");
            freq = directory->createOutput( buf );
            _CLDELETE_CaARRAY( buf );
            
            buf = Misc::segmentname( segment, ".prx");
            prox = directory->createOutput( buf );
            _CLDELETE_CaARRAY( buf );

            tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos,
                  termIndexInterval);
            // NOTE(review): ti is only freed at the end of the try block; an
            // exception inside the loop leaks it — consider scoping it with
            // the _CLFINALLY cleanup.
            TermInfo* ti = _CLNEW TermInfo();
            const TCHAR* currentField = NULL;
            for (int32_t i = 0; i < postingsLength; i++) {
                  Posting* posting = postings[i];

                  // add an entry to the dictionary with pointers to prox and freq files
                  ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1);
                  tis->add(posting->term, ti);
                  
                  // add an entry to the freq file
                  int32_t postingFreq = posting->freq;
                  if (postingFreq == 1)                       // optimize freq=1
                        freq->writeVInt(1);                   // set low bit of doc num.
                  else {
                        freq->writeVInt(0);                   // the document number
                        freq->writeVInt(postingFreq);               // frequency in doc
                  }
                  
                  int32_t lastPosition = 0;                   // write positions
                  for (int32_t j = 0; j < postingFreq; ++j) {             // use delta-encoding
                        prox->writeVInt(posting->positions.values[j] - lastPosition);
                        lastPosition = posting->positions.values[j];
                  }

              // check to see if we switched to a new field
              const TCHAR* termField = posting->term->field();
              if ( currentField == NULL || _tcscmp(currentField,termField) != 0 ) { //todo, can we do an intern'd check?
                 // changing field - see if there is something to save
                 currentField = termField;
                 FieldInfo* fi = fieldInfos->fieldInfo(currentField);
                 
                     if (fi->storeTermVector) {
                    // lazily create the term-vector writer on the first field
                    // that stores term vectors
                    if (termVectorWriter == NULL) {
                       termVectorWriter =
                          _CLNEW TermVectorsWriter(directory, segment, fieldInfos);
                       termVectorWriter->openDocument();
                    }
                    termVectorWriter->openField(currentField);
                 } else if (termVectorWriter != NULL) {
                    termVectorWriter->closeField();
                 }
              }
              if (termVectorWriter != NULL && termVectorWriter->isFieldOpen()) {
                 termVectorWriter->addTerm(posting->term->text(), postingFreq, &posting->positions, &posting->offsets);
              }
            }
          if (termVectorWriter != NULL)
            termVectorWriter->closeDocument();
            _CLDELETE(ti);
    }_CLFINALLY ( 
        const char* err=NULL;
            int32_t ierr=0;

        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        __DOCLOSE(freq);
        __DOCLOSE(prox);
        __DOCLOSE(tis);
        __DOCLOSE(termVectorWriter);
        if ( err != NULL )
           _CLTHROWA(ierr,err);
    );
}

void DocumentWriter::writeNorms(const char* segment) {
//Func - Writes one "<segment>.f<n>" norm file per indexed field that does
//       not omit norms.
//Pre  - fieldBoosts and fieldLengths have been filled in by invertDocument
//Post - Each qualifying field has a single encoded norm byte on disk; each
//       output stream is closed even if writing fails
  char fn[CL_MAX_PATH];
  for(int32_t n = 0; n < fieldInfos->size(); n++){
     FieldInfo* fi = fieldInfos->fieldInfo(n);
     if(fi->isIndexed && !fi->omitNorms){
        // norm combines the accumulated field boost with the similarity's
        // length normalisation for this field
        float_t norm = fieldBoosts[n] * similarity->lengthNorm(fi->name, fieldLengths[n]);

        _snprintf(fn,CL_MAX_PATH,"%s.f%d",segment,n);
        IndexOutput* norms = directory->createOutput(fn);
        try {
           norms->writeByte(CL_NS(search)::Similarity::encodeNorm(norm));
                  }_CLFINALLY ( 
                      norms->close();
                      _CLDELETE(norms);
        )
     }
  }
}
CL_NS_END

// Generated by Doxygen 1.6.0