Logo Search packages:      
Sourcecode: qt4-x11 version File versions

tokenizer.cpp

/****************************************************************************
**
** Copyright (C) 2004-2007 Trolltech ASA. All rights reserved.
** Copyright (C) 2001-2004 Roberto Raggi
**
** This file is part of the qt3to4 porting application of the Qt Toolkit.
**
** This file may be used under the terms of the GNU General Public
** License version 2.0 as published by the Free Software Foundation
** and appearing in the file LICENSE.GPL included in the packaging of
** this file.  Please review the following information to ensure GNU
** General Public Licensing requirements will be met:
** http://www.trolltech.com/products/qt/opensource.html
**
** If you are unsure which license is appropriate for your use, please
** review the following information:
** http://www.trolltech.com/products/qt/licensing.html or contact the
** sales department at sales@trolltech.com.
**
** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
**
****************************************************************************/

#include "tokenizer.h"
#include "tokens.h"
#include <QDateTime>
#include <QHash>
#include <ctype.h>

using TokenEngine::Token;

static QHash<QByteArray, bool> preprocessed;
bool Tokenizer::s_initialized = false;
Tokenizer::scan_fun_ptr Tokenizer::s_scan_table[128 + 1];
int Tokenizer::s_attr_table[256];

Tokenizer::Tokenizer()
{
    if (!s_initialized)
        setupScanTable();
}

Tokenizer::~Tokenizer()
{
}

enum
{
    A_Alpha = 0x01,
    A_Digit = 0x02,
    A_Alphanum = A_Alpha | A_Digit,
    A_Whitespace = 0x04
};

void Tokenizer::setupScanTable()
{
    s_initialized = true;

    memset(s_attr_table, 0, 256);

    for (int i=0; i<128; ++i) {
        switch (i) {
        case ':':
        case '*':
        case '%':
        case '^':
        case '=':
        case '!':
        case '&':
        case '|':
        case '+':
        case '<':
        case '>':
        case '-':
        case '.':
            s_scan_table[i] = &Tokenizer::scanOperator;
            break;

        case '\r':
        case '\n':
            s_scan_table[i] = &Tokenizer::scanNewline;
            break;

        case '#':
            s_scan_table[i] = &Tokenizer::scanPreprocessor;
            break;

        case '/':
            s_scan_table[i] = &Tokenizer::scanComment;
            break;

        case '\'':
            s_scan_table[i] = &Tokenizer::scanCharLiteral;
            break;

        case '"':
            s_scan_table[i] = &Tokenizer::scanStringLiteral;
            break;

        default:
            if (isspace(i)) {
                s_scan_table[i] = &Tokenizer::scanWhiteSpaces;
                s_attr_table[i] |= A_Whitespace;
            } else if (isalpha(i) || i == '_') {
                s_scan_table[i] = &Tokenizer::scanIdentifier;
                s_attr_table[i] |= A_Alpha;
            } else if (isdigit(i)) {
                s_scan_table[i] = &Tokenizer::scanNumberLiteral;
                s_attr_table[i] |= A_Digit;
            } else
                s_scan_table[i] = &Tokenizer::scanChar;
        }
    }

    s_scan_table[128] = &Tokenizer::scanUnicodeChar;
}

QVector<TokenEngine::Token> Tokenizer::tokenize(QByteArray text)
{
    m_tokens.clear();

    m_buffer = text;
    m_ptr = 0;

    // tokenize
    for (;;) {
        Token tk;
        bool endOfFile = nextToken(tk);
        if (endOfFile) {
            break;
        }
        m_tokens.append(tk);
    }

    return m_tokens;
}

bool Tokenizer::nextToken(Token &tok)
{
    int start = m_ptr;
    unsigned char ch = (unsigned char)m_buffer[m_ptr];

    int kind = 0;
    (this->*s_scan_table[ch < 128 ? ch : 128])(&kind);

    tok.start = start;
    tok.length = m_ptr - start;

    return (kind == 0);
}

void Tokenizer::scanChar(int *kind)
{
    *kind = m_buffer[m_ptr++];
}

void Tokenizer::scanWhiteSpaces(int *kind)
{
    *kind = Token_whitespaces;
    while (unsigned char ch = m_buffer[m_ptr]) {
        if (s_attr_table[ch] & A_Whitespace)
            ++m_ptr;
        else
            break;
    }
}

void Tokenizer::scanNewline(int *kind)
{
    Q_UNUSED(kind);
    const unsigned char ch = m_buffer[m_ptr++];
    // Check for \n.
    if (ch == '\n') {
        *kind = '\n';
        return;
    }

    // Check for \r\n.
    if (ch == '\r' && m_buffer[m_ptr] == '\n') {
        *kind = '\n';
        ++ m_ptr;
        return;
    }

    *kind = ch;
}

void Tokenizer::scanUnicodeChar(int *kind)
{
    *kind = m_buffer[m_ptr++];
}

void Tokenizer::scanCharLiteral(int *kind)
{
    ++m_ptr;
    for (;;) {
        unsigned char ch = m_buffer[m_ptr];
        switch (ch) {
        case '\0':
        case '\n':
            // ### error
            *kind = Token_char_literal;
            return;
        case '\\':
            if (m_buffer[m_ptr+1] == '\'' || m_buffer[m_ptr+1] == '\\')
                m_ptr += 2;
            else
                ++m_ptr;
            break;
        case '\'':
            ++m_ptr;
            *kind = Token_char_literal;
            return;
        default:
            ++m_ptr;
            break;
        }
    }

    // ### error
    *kind = Token_char_literal;
}

void Tokenizer::scanStringLiteral(int *kind)
{
    ++m_ptr;
    while (m_buffer[m_ptr]) {
        switch (m_buffer[m_ptr]) {
        case '\n':
            // ### error
            *kind = Token_string_literal;
            return;
        case '\\':
            if (m_buffer[m_ptr+1] == '"' || m_buffer[m_ptr+1] == '\\')
                m_ptr += 2;
            else
                ++m_ptr;
            break;
        case '"':
            ++m_ptr;
            *kind = Token_string_literal;
            return;
        default:
            ++m_ptr;
            break;
        }
    }

    // ### error
    *kind = Token_string_literal;
}

void Tokenizer::scanIdentifier(int *kind)
{
    unsigned char ch;
    for (;;) {
        ch = m_buffer[m_ptr];
        if (s_attr_table[ch] & A_Alphanum)
            ++m_ptr;
        else
            break;
    }
    *kind = Token_identifier;
}

void Tokenizer::scanNumberLiteral(int *kind)
{
    unsigned char ch;
    for (;;) {
        ch = m_buffer[m_ptr];
        if (s_attr_table[ch] & A_Alphanum || ch == '.')
            ++m_ptr;
        else
            break;
    }

    // ### finish to implement me!!
    *kind = Token_number_literal;
}

void Tokenizer::scanComment(int *kind)
{
    if (!(m_buffer[m_ptr+1] == '/' || m_buffer[m_ptr+1] == '*')) {
        scanOperator(kind);
            return;
      }

    ++m_ptr; // skip '/'

    bool multiLineComment = m_buffer[m_ptr++] == '*';

    while (m_buffer[m_ptr]) {
        switch (m_buffer[m_ptr]) {
        case '\r':
        case '\n':
            if (!multiLineComment) {
                *kind = Token_comment;
                return;
            }

            (void) scanNewline(kind);
            break;

        case '*':
            if (multiLineComment && m_buffer[m_ptr+1] == '/') {
                m_ptr += 2;
                *kind = Token_comment;
                return;
            }
            ++m_ptr;
            break;

        default:
            ++m_ptr;
        }
    }

    // ### error
    *kind = Token_comment;
}


void Tokenizer::scanPreprocessor(int *kind)
{
    ++m_ptr;
    *kind = Token_preproc;
}


void Tokenizer::scanOperator(int *kind)
{
    switch (m_buffer[m_ptr]) {
    case ':':
        if (m_buffer[m_ptr+1] == ':') {
            m_ptr += 2;
            *kind = Token_scope;
            return;
        }
        break;

    case '*':
    case '/':
    case '%':
    case '^':
        if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_assign;
            return;
        }
        break;

    case '=':
    case '!':
        if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_eq;
            return;
        }
        break;

    case '&':
        if (m_buffer[m_ptr+1] == '&') {
            m_ptr += 2;
            *kind = Token_and;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_assign;
            return;
        }
        break;

    case '|':
        if (m_buffer[m_ptr+1] == '|' ) {
            m_ptr += 2;
            *kind = Token_or;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_assign;
            return;
        }
        break;

    case '+':
        if (m_buffer[m_ptr+1] == '+' ) {
            m_ptr += 2;
            *kind = Token_incr;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_assign;
            return;
        }
        break;

    case '<':
        if (m_buffer[m_ptr+1] == '<') {
            if (m_buffer[m_ptr+2] == '=') {
                m_ptr += 3;
                *kind = Token_assign;
                return;
            }
            m_ptr += 2;
            *kind = Token_shift;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_leq;
            return;
        }
        break;

    case '>':
        if (m_buffer[m_ptr+1] == '>') {
            if (m_buffer[m_ptr+2] == '=') {
                m_ptr += 3;
                *kind = Token_assign;
                return;
            }
            m_ptr += 2;
            *kind = Token_shift;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_geq;
            return;
        }
        break;

    case '-':
        if (m_buffer[m_ptr+1] == '>') {
            if (m_buffer[m_ptr+2] == '*') {
                m_ptr += 3;
                *kind = Token_ptrmem;
                return;
            }
            m_ptr += 2;
            *kind = Token_arrow;
            return;
        } else if (m_buffer[m_ptr+1] == '-') {
            m_ptr += 2;
            *kind = Token_decr;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_assign;
            return;
        }
        break;

    case '.':
        if (m_buffer[m_ptr+1] == '.' && m_buffer[m_ptr+2] == '.') {
            m_ptr += 3;
            *kind = Token_ellipsis;
            return;
        } else if (m_buffer[m_ptr+1] == '*') {
            m_ptr += 2;
            *kind = Token_ptrmem;
            return;
        }
        break;

    }

    *kind = m_buffer[m_ptr++];
}

Generated by  Doxygen 1.6.0   Back to index