Logo Search packages:      
Sourcecode: qt4-x11 version File versions

utf8.cpp

/*
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.      See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 *
 ************************************************
 * Also licensed with permission from Tom Tromey 
 * and Owen Taylor under the Apache license.
 * Original location:
 * http://cvs.gnome.org/viewcvs/glib/glib/gutf8.c?rev=1.50&view=log
 ************************************************
 * 
 * Copyright 2003-2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "CLucene/StdHeader.h"

/* Unsigned integer type used in this file to hold a single character code
 * while it is being encoded (see lucene_wctoutf8). */
typedef unsigned long  gunichar;
/* Unsigned byte type; used in this file for small byte counts. */
typedef unsigned char  guchar;

/*
 * UTF8_COMPUTE:
 * @Char: lead (first) byte of a UTF-8 sequence; pass it as an unsigned value
 * @Mask: int lvalue that receives the mask selecting the payload bits of
 *        the lead byte
 * @Len:  int lvalue that receives the sequence length in bytes (1 to 6),
 *        or -1 when @Char cannot start a sequence
 *
 * Classifies a lead byte. @Mask is left untouched when @Len is set to -1.
 *
 * The body is wrapped in do { } while (0) so the expansion is exactly one
 * statement: "UTF8_COMPUTE (c, mask, len);" is safe as the body of an
 * unbraced if/else. @Char is evaluated more than once, so it must not have
 * side effects.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                                   \
  do                                                                    \
    {                                                                   \
      if ((Char) < 128)                                                 \
        {                                                               \
          (Len) = 1;                                                    \
          (Mask) = 0x7f;                                                \
        }                                                               \
      else if (((Char) & 0xe0) == 0xc0)                                 \
        {                                                               \
          (Len) = 2;                                                    \
          (Mask) = 0x1f;                                                \
        }                                                               \
      else if (((Char) & 0xf0) == 0xe0)                                 \
        {                                                               \
          (Len) = 3;                                                    \
          (Mask) = 0x0f;                                                \
        }                                                               \
      else if (((Char) & 0xf8) == 0xf0)                                 \
        {                                                               \
          (Len) = 4;                                                    \
          (Mask) = 0x07;                                                \
        }                                                               \
      else if (((Char) & 0xfc) == 0xf8)                                 \
        {                                                               \
          (Len) = 5;                                                    \
          (Mask) = 0x03;                                                \
        }                                                               \
      else if (((Char) & 0xfe) == 0xfc)                                 \
        {                                                               \
          (Len) = 6;                                                    \
          (Mask) = 0x01;                                                \
        }                                                               \
      else                                                              \
        (Len) = -1;                                                     \
    }                                                                   \
  while (0)

/*#define UTF8_LENGTH(Char)              \
  ((Char) < 0x80 ? 1 :                 \
   ((Char) < 0x800 ? 2 :               \
    ((Char) < 0x10000 ? 3 :            \
     ((Char) < 0x200000 ? 4 :          \
      ((Char) < 0x4000000 ? 5 : 6)))))*/


/*
 * UTF8_GET:
 * @Result: integer lvalue that receives the decoded value, or -1 if one of
 *          the trailing bytes is not a continuation byte (10xxxxxx)
 * @Chars:  pointer/array giving the bytes of the sequence
 * @Count:  integer lvalue used as the loop index; on failure it is left at
 *          the index of the offending byte
 * @Mask:   payload mask for the lead byte (as produced by UTF8_COMPUTE)
 * @Len:    sequence length in bytes (as produced by UTF8_COMPUTE)
 *
 * Decodes one UTF-8 sequence of @Len bytes starting at @Chars[0]. Bytes are
 * examined in order and decoding stops at the first non-continuation byte.
 *
 * The body is wrapped in do { } while (0) so the expansion is exactly one
 * statement; without it an unbraced "if (x) UTF8_GET (...);" would guard
 * only the first assignment and always run the loop. Arguments are
 * evaluated more than once, so they must not have side effects.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                       \
  do                                                                    \
    {                                                                   \
      (Result) = (Chars)[0] & (Mask);                                   \
      for ((Count) = 1; (Count) < (Len); ++(Count))                     \
        {                                                               \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                        \
            {                                                           \
              (Result) = -1;                                            \
              break;                                                    \
            }                                                           \
          (Result) <<= 6;                                               \
          (Result) |= ((Chars)[(Count)] & 0x3f);                        \
        }                                                               \
    }                                                                   \
  while (0)


/**
 * lucene_wctoutf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.
 *
 * Return value: number of bytes written
 **/
size_t      lucene_wctoutf8(char * outbuf, const wchar_t ch)
{
  gunichar c = ch;
  guchar len = 0;
  int first;
  int i;

  if (c < 0x80)
    {
      first = 0;
      len = 1;
    }
  else if (c < 0x800)
    {
      first = 0xc0;
      len = 2;
    }
  else if (c < 0x10000)
    {
      first = 0xe0;
      len = 3;
    }
   else if (c < 0x200000)
    {
      first = 0xf0;
      len = 4;
    }
  else if (c < 0x4000000)
    {
      first = 0xf8;
      len = 5;
    }
  else
    {
      first = 0xfc;
      len = 6;
    }

  if (outbuf)
  {
      for (i = len - 1; i > 0; --i)
      {
            outbuf[i] = (char)((c & 0x3f) | 0x80);
            c >>= 6;
      }
      outbuf[0] = c | first;
  }

  return len;
}


/**
 * lucene_utf8towc:
 * @pwc: receives the decoded character; written only on success
 * @p: pointer to the first byte of a UTF-8 encoded character
 * @n: maximum number of bytes that may be examined starting at @p
 *
 * Decodes one UTF-8 sequence (1 to 6 bytes) into a character.
 *
 * Only the structure of the sequence is checked: the lead byte must be a
 * valid lead byte, the whole sequence must fit within @n bytes, and every
 * trailing byte must be a continuation byte (10xxxxxx). Overlong forms and
 * out-of-range values are not rejected. Bytes are examined in order and
 * decoding stops at the first non-continuation byte, so a sequence cut
 * short by a NUL terminator is reported as an error and nothing beyond the
 * terminator is read.
 *
 * Return value: number of bytes consumed (1 to 6), or 0 if the bytes at
 * @p do not form a complete sequence
 **/
size_t lucene_utf8towc(wchar_t *pwc, const char *p, size_t n)
{
  const unsigned char *bytes = (const unsigned char *) p;
  unsigned char lead;
  unsigned long value;   /* unsigned, so the shifts below are well defined */
  size_t len;
  size_t i;

  if (n == 0)
    return 0;

  lead = bytes[0];

  /* The lead byte fixes both the length and how many payload bits it holds. */
  if (lead < 0x80)
    {
      len = 1;
      value = lead;
    }
  else if ((lead & 0xe0) == 0xc0)
    {
      len = 2;
      value = lead & 0x1f;
    }
  else if ((lead & 0xf0) == 0xe0)
    {
      len = 3;
      value = lead & 0x0f;
    }
  else if ((lead & 0xf8) == 0xf0)
    {
      len = 4;
      value = lead & 0x07;
    }
  else if ((lead & 0xfc) == 0xf8)
    {
      len = 5;
      value = lead & 0x03;
    }
  else if ((lead & 0xfe) == 0xfc)
    {
      len = 6;
      value = lead & 0x01;
    }
  else
    return 0;   /* stray continuation byte, 0xfe or 0xff */

  /* Never look past the bytes the caller says are available. */
  if (len > n)
    return 0;

  for (i = 1; i < len; ++i)
    {
      /* Reporting success here would make callers skip len bytes, possibly
         stepping over the string's NUL terminator. */
      if ((bytes[i] & 0xc0) != 0x80)
        return 0;
      value = (value << 6) | (unsigned long)(bytes[i] & 0x3f);
    }

  *pwc = (wchar_t) value;
  return len;
}


//this function was not taken from gnome
//
//Converts the NUL-terminated wide string str to UTF-8 in result.
//result_length is the size of result in bytes, including room for the
//terminating NUL. Conversion stops at the end of str or at the first
//character whose complete encoding no longer fits, so the output is
//always NUL-terminated and is never cut in the middle of a character.
//Returns the number of bytes written, not counting the terminator.
//If result_length is 0 nothing is written and 0 is returned.
size_t lucene_wcstoutf8(char * result, const wchar_t * str, size_t result_length){
  char *p = result;
  size_t remaining;   // free bytes, with one always kept for the terminator
  size_t i;

  if (result_length == 0)
    return 0;

  remaining = result_length - 1;

  for (i = 0; str[i] != 0; ++i) {
    // A character may need up to 6 bytes; measure it before writing.
    size_t need = lucene_wctoutf8(NULL, str[i]);
    if (need > remaining)
      break;
    p += lucene_wctoutf8(p, str[i]);
    remaining -= need;
  }

  *p = '\0';

  return (size_t)(p - result);
}
//this function was not taken from gnome
//
//Converts the NUL-terminated UTF-8 string str to wide characters in result.
//result_length is the capacity of result in wchar_t elements. Conversion
//stops at the end of str or when result is full. A terminating NUL is
//stored only if there is room left for it, so the output is NOT terminated
//when exactly result_length characters were produced.
//Returns the number of bytes consumed from str, or 0 if a byte sequence
//could not be decoded (in that case the output is terminated at the point
//of failure).
size_t lucene_utf8towcs(wchar_t * result, const char * str, size_t result_length){
  const char *sp = str;
  wchar_t *rp = result;
  wchar_t * const end = result + result_length;

  while (rp < end && *sp != 0){
    size_t r = lucene_utf8towc(rp, sp, 6);
    // The decoder signals failure with 0; (size_t)-1 is also treated as
    // failure because that is what this loop originally tested for.
    if ( r == 0 || r == (size_t)-1 ){
      *rp = 0;   // rp < end here, so this stays inside the buffer
      return 0;
    }
    sp += r;
    rp++;
  }

  // Room is measured in output characters, not in input bytes consumed.
  if ( rp < end )
    *rp = 0;

  return (size_t)(sp - str);
}
//Report how many bytes (1 to 6) the UTF-8 character starting at p occupies,
//judged solely from its first byte; the following bytes are not examined.
//For a byte that cannot start a character UTF8_COMPUTE yields -1, which is
//returned converted to size_t.
//this function was not taken from gnome
size_t lucene_utf8charlen(const char *p)
{
  const unsigned char lead = (unsigned char) p[0];
  int payload_mask = 0;   // filled in by UTF8_COMPUTE, not needed here
  int nbytes = 0;

  UTF8_COMPUTE (lead, payload_mask, nbytes);

  return (size_t) nbytes;
}

Generated by  Doxygen 1.6.0   Back to index