scanner.cxx


/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
 
#include <basiccharclass.hxx>
#include <scanner.hxx>
#include <sbintern.hxx>
#include <runtime.hxx>
 
#include <basic/sberrors.hxx>
#include <i18nlangtag/lang.h>
#include <svl/numformat.hxx>
#include <svl/zforlist.hxx>
#include <rtl/character.hxx>
#include <o3tl/string_view.hxx>
#include <utility>
#include <vector>
 
SbiScanner::SbiScanner(OUString _aBuf, StarBASIC* p)
    : aBuf(std::move(_aBuf))
    , nLineIdx(-1)
    , nSaveLineIdx(-1)
    , pBasic(p)
    , eScanType(SbxVARIANT)
    , nVal(0)
    , nSavedCol1(0)
    , nCol(0)
    , nErrors(0)
    , nColLock(0)
    , nBufPos(0)
    , nLine(0)
    , nCol1(0)
    , nCol2(0)
    , bSymbol(false)
    , bNumber(false)
    , bSpaces(false)
    , bAbort(false)
    , bHash(true)
    , bError(false)
    , bCompatible(false)
    , bVBASupportOn(false)
    , bPrevLineExtentsComment(false)
    , bClosingUnderscore(false)
    , bLineEndsWithWhitespace(false)
    , bInStatement(false)
{
}
 
// Increments the column lock counter. Only the outermost lock (counter was 0)
// remembers the current symbol start column nCol1 in nSavedCol1; GenError()
// reports that saved column while a lock is active.
void SbiScanner::LockColumn()
{
    const bool bOutermostLock = ( nColLock == 0 );
    ++nColLock;
    if( bOutermostLock )
    {
        nSavedCol1 = nCol1;
    }
}
 
// Releases one level of column locking; does nothing when no lock is held.
void SbiScanner::UnlockColumn()
{
    if( nColLock == 0 )
        return;

    --nColLock;
}
 
// Records a compile error with the given code.
// While compiler errors are blocked (GetSbData()->bBlockCompilerError) the
// scan is merely flagged as aborted. Otherwise only the first error of a
// statement (bError not yet set) is forwarded to pBasic->CError(); every call
// increments nErrors.
void SbiScanner::GenError( ErrCode code )
{
    if( GetSbData()->bBlockCompilerError )
    {
        bAbort = true;
        return;
    }

    if( bError )
    {
        // an error was already reported for this statement - just count it
        ++nErrors;
        return;
    }

    // report only one error per statement
    bError = true;

    bool bGoOn = true;
    if( pBasic )
    {
        // EXPECTED / UNEXPECTED style errors always refer to the last token,
        // so they use its start column even while the column is locked
        const bool bRefersToLastToken = code.anyOf(
                ERRCODE_BASIC_EXPECTED,
                ERRCODE_BASIC_UNEXPECTED,
                ERRCODE_BASIC_SYMBOL_EXPECTED,
                ERRCODE_BASIC_LABEL_EXPECTED);

        sal_Int32 nStartCol;
        if( bRefersToLastToken )
        {
            nStartCol = nCol1;
            // keep the reported range non-inverted
            if( nStartCol > nCol2 )
                nCol2 = nStartCol;
        }
        else if( nColLock )
        {
            nStartCol = nSavedCol1;
        }
        else
        {
            nStartCol = nCol1;
        }

        bGoOn = pBasic->CError( code, aError, nLine, nStartCol, nCol2 );
    }

    // out-of-memory and program-too-large always stop the scan
    const bool bFatalCode = ( code == ERRCODE_BASIC_NO_MEMORY || code == ERRCODE_BASIC_PROG_TOO_LARGE );
    if( !bGoOn || bFatalCode )
        bAbort = true;

    ++nErrors;
}
 
 
// Used by SbiTokenizer::MayBeLabel() to detect a label: when the character at
// the current column is ':', it is consumed and true is returned; otherwise
// the position is left unchanged and false is returned.
bool SbiScanner::DoesColonFollow()
{
    const bool bColonNext = ( nCol < aLine.getLength() ) && ( aLine[nCol] == ':' );
    if( bColonNext )
    {
        ++nLineIdx;
        ++nCol;
    }
    return bColonNext;
}
 
// Maps a type suffix character to the data type it denotes.
// Returns SbxVARIANT when c is not one of the suffix characters % & ! # @ $.
static SbxDataType GetSuffixType( sal_Unicode c )
{
    struct SuffixEntry
    {
        sal_Unicode cSuffix;
        SbxDataType eType;
    };
    static constexpr SuffixEntry aSuffixTable[] = {
        { u'%', SbxINTEGER },
        { u'&', SbxLONG },
        { u'!', SbxSINGLE },
        { u'#', SbxDOUBLE },
        { u'@', SbxCURRENCY },
        { u'$', SbxSTRING },
    };

    for( const SuffixEntry& rEntry : aSuffixTable )
    {
        if( rEntry.cSuffix == c )
            return rEntry.eType;
    }
    return SbxVARIANT;
}
 
// SbiScanner::NextSym() reads the next symbol into the variables aSym, nVal
// and eScanType; its return value is false at EOF or on errors.
// BUF_SIZE is the capacity (in characters, including the terminating 0) of
// the local buffer in which NextSym() collects the text of a decimal number.
#define BUF_SIZE 80
 
void SbiScanner::scanAlphanumeric()
{
    sal_Int32 n = nCol;
    while(nCol < aLine.getLength() && (BasicCharClass::isAlphaNumeric(aLine[nCol], bCompatible) || aLine[nCol] == '_'))
    {
        ++nLineIdx;
        ++nCol;
    }
    aSym = aLine.copy(n, nCol - n);
}
 
void SbiScanner::scanGoto()
{
    sal_Int32 n = nCol;
    while(n < aLine.getLength() && BasicCharClass::isWhitespace(aLine[n]))
        ++n;
 
    if(n + 1 < aLine.getLength())
    {
        std::u16string_view aTemp = aLine.subView(n, 2);
        if(o3tl::equalsIgnoreAsciiCase(aTemp, u"to"))
        {
            aSym = "goto";
            nLineIdx += n + 2 - nCol;
            nCol = n + 2;
        }
    }
}
 
bool SbiScanner::readLine()
{
    if(nBufPos >= aBuf.getLength())
        return false;
 
    sal_Int32 n = nBufPos;
    sal_Int32 nLen = aBuf.getLength();
 
    while(n < nLen && aBuf[n] != '\r' && aBuf[n] != '\n')
        ++n;
 
    // Trim trailing whitespace
    sal_Int32 nEnd = n;
    while(nBufPos < nEnd && BasicCharClass::isWhitespace(aBuf[nEnd - 1]))
        --nEnd;
 
    // tdf#149402 - check if line ends with a whitespace
    bLineEndsWithWhitespace = (n > nEnd);
    aLine = aBuf.copy(nBufPos, nEnd - nBufPos);
 
    // Fast-forward past the line ending
    if(n + 1 < nLen && aBuf[n] == '\r' && aBuf[n + 1] == '\n')
        n += 2;
    else if(n < nLen)
        ++n;
 
    nBufPos = n;
    nLineIdx = 0;
 
    ++nLine;
    nCol = nCol1 = nCol2 = 0;
    nColLock = 0;
 
    return true;
}
 
// Checks whether the text following a '#' begins (ASCII case insensitively)
// with one of the known compiler directive keywords.
static bool isValidCompilerDirective(std::u16string_view directive) {
    static constexpr std::u16string_view aKnownDirectives[] = {
        u"if", u"elseif", u"else", u"end", u"const"
    };

    for (const std::u16string_view& rKeyword : aKnownDirectives)
    {
        if (o3tl::matchIgnoreAsciiCase(directive, rKeyword))
            return true;
    }
    return false;
}
 
// Scans the next symbol of the source buffer.
//
// The result is left in the scanner state: aSym (symbol text), nVal (value of
// a number or date literal), eScanType (type derived from the literal or from
// a type suffix), the flags bSymbol / bNumber / bSpaces / bHash and the column
// range nCol1..nCol2 of the symbol. The end of a line is reported as the
// symbol "\n"; a comment is reported as the symbol "REM".
//
// Returns false when no further line can be read, or when nothing could be
// consumed at the current position (ERRCODE_BASIC_SYMBOL_EXPECTED is raised
// in that case); returns true otherwise.
bool SbiScanner::NextSym()
{
    // memorize for the EOLN-case
    sal_Int32 nOldLine = nLine;
    sal_Int32 nOldCol1 = nCol1;
    sal_Int32 nOldCol2 = nCol2;
    sal_Unicode buf[ BUF_SIZE ], *p = buf;

    eScanType = SbxVARIANT;
    aSym.clear();
    bHash = bSymbol = bNumber = bSpaces = false;

    // read in line? (nLineIdx == -1 means the current line is used up)
    if (nLineIdx == -1)
    {
        if(!readLine())
            return false;

        nOldLine = nLine;
        nOldCol1 = nOldCol2 = 0;
    }

    // used at the end to detect that nothing was consumed
    const sal_Int32 nLineIdxScanStart = nLineIdx;

    // skip leading whitespace, remembering that there was some
    if(nCol < aLine.getLength() && BasicCharClass::isWhitespace(aLine[nCol]))
    {
        bSpaces = true;
        while(nCol < aLine.getLength() && BasicCharClass::isWhitespace(aLine[nCol]))
        {
            ++nLineIdx;
            ++nCol;
        }
    }

    nCol1 = nCol;

    // only blank line?
    if(nCol >= aLine.getLength())
        goto eoln;

    // the previous comment line ended in " _": this line continues the comment
    if( bPrevLineExtentsComment )
        goto PrevLineCommentLbl;

    if(nCol < aLine.getLength() && aLine[nCol] == '#')
    {
        sal_Int32 nLineTempIdx = nLineIdx;
        std::u16string_view candidate(aLine.subView(nCol + 1));

        // look ahead to the next whitespace, '#' or ','
        do
        {
            nLineTempIdx++;
        } while (nLineTempIdx < aLine.getLength() && !BasicCharClass::isWhitespace(aLine[nLineTempIdx])
            && aLine[nLineTempIdx] != '#' && aLine[nLineTempIdx] != ',');
        // leave it if it is a date literal - it will be handled later
        if (nLineTempIdx >= aLine.getLength() || aLine[nLineTempIdx] != '#')
        {
            ++nLineIdx;
            ++nCol;
            //handle compiler directives (# is first non-space character)
            if (nOldCol2 == 0)
            {
                if (isValidCompilerDirective(candidate))
                {
                    // Skip the whole line if starts with a hash and is a valid compiler directive
                    nCol = 0;
                    goto eoln;
                }
                else
                {
                    GenError(ERRCODE_BASIC_SYNTAX);
                }
            }
            else
                bHash = true;
        }
    }

    // copy character if symbol
    if(nCol < aLine.getLength() && (BasicCharClass::isAlpha(aLine[nCol], bCompatible) || aLine[nCol] == '_'))
    {
        // if there's nothing behind '_' , it's the end of a line!
        if(nCol + 1 == aLine.getLength() && aLine[nCol] == '_')
        {
            // Note that nCol is not incremented here...
            ++nLineIdx;
            goto eoln;
        }

        bSymbol = true;

        scanAlphanumeric();

        // Special handling for "go to"
        if(nCol < aLine.getLength() && bCompatible && aSym.equalsIgnoreAsciiCase("go"))
            scanGoto();

        // tdf#125637 - check for closing underscore
        if (nCol == aLine.getLength() && aLine[nCol - 1] == '_')
        {
            bClosingUnderscore = true;
        }
        // type recognition?
        // don't test the exclamation mark
        // if there's a symbol behind it
        else if((nCol >= aLine.getLength() || aLine[nCol] != '!') ||
                (nCol + 1 >= aLine.getLength() || !BasicCharClass::isAlpha(aLine[nCol + 1], bCompatible)))
        {
            if(nCol < aLine.getLength())
            {
                SbxDataType t(GetSuffixType(aLine[nCol]));
                if( t != SbxVARIANT )
                {
                    eScanType = t;
                    ++nLineIdx;
                    ++nCol;
                }
            }
        }
    }

    // read in and convert if number
    else if((nCol < aLine.getLength() && rtl::isAsciiDigit(aLine[nCol])) ||
            (nCol + 1 < aLine.getLength() && aLine[nCol] == '.' && rtl::isAsciiDigit(aLine[nCol + 1])))
    {
        short exp = 0;
        short dec = 0;
        eScanType = SbxDOUBLE;
        bool bScanError = false;
        bool bBufOverflow = false;

        // strchr() converts its second argument to char, so passing a UTF-16
        // code unit directly would let e.g. U+0130 alias '0', and a NUL would
        // match the set's terminator. Only real, non-NUL ASCII may match.
        const auto isAsciiCharOf = [](const char* pSet, sal_Unicode c)
        {
            return c != 0 && c < 0x80 && strchr(pSet, static_cast<char>(c)) != nullptr;
        };

        // All this because of 'D' or 'd' floating point type, sigh...
        while(!bScanError && nCol < aLine.getLength() && isAsciiCharOf("0123456789.DEde", aLine[nCol]))
        {
            // from 4.1.1996: buffer full? -> go on scanning empty
            if( (p-buf) == (BUF_SIZE-1) )
            {
                bBufOverflow = true;
                ++nLineIdx;
                ++nCol;
                continue;
            }
            // point or exponent?
            if(aLine[nCol] == '.')
            {
                if( ++dec > 1 )
                    bScanError = true;
                else
                    *p++ = '.';
            }
            else if(isAsciiCharOf("DdEe", aLine[nCol]))
            {
                if (++exp > 1)
                    bScanError = true;
                else
                {
                    // 'D'/'d' exponents are normalized to 'E' for the conversion below
                    *p++ = 'E';
                    if (nCol + 1 < aLine.getLength() && (aLine[nCol+1] == '+' || aLine[nCol+1] == '-'))
                    {
                        ++nLineIdx;
                        ++nCol;
                        if( (p-buf) == (BUF_SIZE-1) )
                        {
                            bBufOverflow = true;
                            continue;
                        }
                        *p++ = aLine[nCol];
                    }
                }
            }
            else
            {
                *p++ = aLine[nCol];
            }
            ++nLineIdx;
            ++nCol;
        }
        *p = 0;
        // NOTE(review): p points at the terminating 0 here, so aSym becomes an
        // empty string rather than the number text - kept as is, confirm
        // whether callers rely on it.
        aSym = p; bNumber = true;

        // For bad characters, scan and parse errors generate only one error.
        ErrCode nError = ERRCODE_NONE;
        if (bScanError)
        {
            --nLineIdx;
            --nCol;
            aError = OUString( aLine[nCol]);
            nError = ERRCODE_BASIC_BAD_CHAR_IN_NUMBER;
        }

        rtl_math_ConversionStatus eStatus = rtl_math_ConversionStatus_Ok;
        const sal_Unicode* pParseEnd = buf;
        nVal = rtl_math_uStringToDouble( buf, buf+(p-buf), '.', ',', &eStatus, &pParseEnd );
        if (pParseEnd != buf+(p-buf))
        {
            // e.g. "12e" or "12e+", or with bScanError "12d"+"E".
            sal_Int32 nChars = buf+(p-buf) - pParseEnd;
            nLineIdx -= nChars;
            nCol -= nChars;
            // For bScanError, nLineIdx and nCol were already decremented, just
            // add that character to the parse end.
            if (bScanError)
                ++nChars;
            // Copy error position from original string, not the buffer
            // replacement where "12dE" => "12EE".
            aError = aLine.copy( nCol, nChars);
            nError = ERRCODE_BASIC_BAD_CHAR_IN_NUMBER;
        }
        else if (eStatus != rtl_math_ConversionStatus_Ok)
        {
            // Keep the scan error and character at position, if any.
            if (!nError)
                nError = ERRCODE_BASIC_MATH_OVERFLOW;
        }

        if (nError)
            GenError( nError );

        // without decimal point and exponent the literal is an Integer or Long
        // if its value fits
        if( !dec && !exp )
        {
            if( nVal >= SbxMININT && nVal <= SbxMAXINT )
                eScanType = SbxINTEGER;
            else if( nVal >= SbxMINLNG && nVal <= SbxMAXLNG )
                    eScanType = SbxLONG;
        }

        if( bBufOverflow )
            GenError( ERRCODE_BASIC_MATH_OVERFLOW );

        // type recognition?
        if( nCol < aLine.getLength() )
        {
            SbxDataType t(GetSuffixType(aLine[nCol]));
            if( t != SbxVARIANT )
            {
                eScanType = t;
                ++nLineIdx;
                ++nCol;
            }
            // tdf#130476 - don't allow String trailing data type character with numbers
            if ( t == SbxSTRING )
            {
                GenError( ERRCODE_BASIC_SYNTAX );
            }
        }
    }

    // Hex/octal number? Read in and convert:
    else if(aLine.getLength() - nCol > 1 && aLine[nCol] == '&')
    {
        ++nLineIdx; ++nCol;
        sal_Unicode base = 16;
        sal_Unicode xch  = aLine[nCol];
        ++nLineIdx; ++nCol;
        switch( rtl::toAsciiUpperCase( xch ) )
        {
            case 'O':
                base = 8;
                break;
            case 'H':
                break;
            default :
                // treated as an operator
                --nLineIdx; --nCol; nCol1 = nCol-1;
                aSym = "&";
                return true;
        }
        bNumber = true;
        // Hex literals are signed Integers (as defined by basic),
        // e.g. -2,147,483,648 through 2,147,483,647 (signed)
        sal_uInt64 lu = 0;
        bool bOverflow = false;
        while(nCol < aLine.getLength() && BasicCharClass::isAlphaNumeric(aLine[nCol], false))
        {
            sal_Unicode ch = rtl::toAsciiUpperCase(aLine[nCol]);
            ++nLineIdx; ++nCol;
            if( ((base == 16 ) && rtl::isAsciiHexDigit( ch ) ) ||
                     ((base == 8) && rtl::isAsciiOctalDigit( ch )))
            {
                // '0'..'9' map to 0..9; 'A'..'F' map to 10..15
                int i = ch  - '0';
                if( i > 9 ) i -= 7;
                lu = ( lu * base ) + i;
                if( lu > SAL_MAX_UINT32 )
                {
                    bOverflow = true;
                }
            }
            else
            {
                aError = OUString(ch);
                GenError( ERRCODE_BASIC_BAD_CHAR_IN_NUMBER );
            }
        }

        // tdf#130476 - take into account trailing data type characters
        if( nCol < aLine.getLength() )
        {
            SbxDataType t(GetSuffixType(aLine[nCol]));
            if( t != SbxVARIANT )
            {
                eScanType = t;
                ++nLineIdx;
                ++nCol;
            }
            // tdf#130476 - don't allow String trailing data type character with numbers
            if ( t == SbxSTRING )
            {
                GenError( ERRCODE_BASIC_SYNTAX );
            }
        }

        // tdf#130476 - take into account trailing data type characters
        switch ( eScanType )
        {
            case SbxINTEGER:
                nVal = static_cast<double>( static_cast<sal_Int16>(lu) );
                if ( lu > SbxMAXUINT )
                {
                    bOverflow = true;
                }
                break;
            case SbxLONG: nVal = static_cast<double>( static_cast<sal_Int32>(lu) ); break;
            case SbxVARIANT:
            {
                // tdf#62326 - If the value of the hex string without explicit type character lies within
                // the range of 0x8000 (SbxMAXINT + 1) and 0xFFFF (SbxMAXUINT) inclusive, cast the value
                // to 16 bit in order to get signed integers, e.g., SbxMININT through SbxMAXINT
                sal_Int32 ls = (lu > SbxMAXINT && lu <= SbxMAXUINT) ? static_cast<sal_Int16>(lu) : static_cast<sal_Int32>(lu);
                eScanType = ( ls >= SbxMININT && ls <= SbxMAXINT ) ? SbxINTEGER : SbxLONG;
                nVal = static_cast<double>(ls);
                break;
            }
            default:
                nVal = static_cast<double>(lu);
                break;
        }
        if( bOverflow )
            GenError( ERRCODE_BASIC_MATH_OVERFLOW );
    }

    // Strings (and symbols in square brackets):
    else if (nLineIdx < aLine.getLength() && (aLine[nLineIdx] == '"' || aLine[nLineIdx] == '['))
    {
        sal_Unicode cSep = aLine[nLineIdx];
        if( cSep == '[' )
        {
            bSymbol = true;
            cSep = ']';
        }
        // n is the column of the first character behind the opening delimiter
        sal_Int32 n = nCol + 1;
        while (nLineIdx < aLine.getLength())
        {
            // advance to the next delimiter (or the end of the line)
            do
            {
                nLineIdx++;
                nCol++;
            }
            while (nLineIdx < aLine.getLength() && (aLine[nLineIdx] != cSep));
            if (nLineIdx < aLine.getLength() && aLine[nLineIdx] == cSep)
            {
                nLineIdx++; nCol++;
                // a doubled '"' is an escaped quote and does not end the string
                if (nLineIdx >= aLine.getLength() || aLine[nLineIdx] != cSep || cSep == ']')
                {
                    // If VBA Interop then doesn't eat the [] chars
                    if ( cSep == ']' && bVBASupportOn )
                        aSym = aLine.copy( n - 1, nCol - n  + 1);
                    else
                        aSym = aLine.copy( n, nCol - n - 1 );
                    // get out duplicate string delimiters
                    OUStringBuffer aSymBuf(aSym.getLength());
                    for ( sal_Int32 i = 0, len = aSym.getLength(); i < len; ++i )
                    {
                        aSymBuf.append( aSym[i] );
                        if ( aSym[i] == cSep && ( i+1 < len ) && aSym[i+1] == cSep )
                            ++i;
                    }
                    aSym = aSymBuf.makeStringAndClear();
                    if( cSep != ']' )
                        eScanType = SbxSTRING;
                    break;
                }
            }
            else
            {
                // end of line reached without a closing delimiter
                aError = OUString(cSep);
                GenError( ERRCODE_BASIC_EXPECTED );
            }
        }
    }

    // Date:
    else if (nLineIdx < aLine.getLength() && aLine[nLineIdx] == '#')
    {
        sal_Int32 n = nCol + 1;
        do
        {
            nLineIdx++;
            nCol++;
        }
        while (nLineIdx < aLine.getLength() && (aLine[nLineIdx] != '#'));
        if (nLineIdx < aLine.getLength() && aLine[nLineIdx] == '#')
        {
            nLineIdx++; nCol++;
            aSym = aLine.copy( n, nCol - n - 1 );

            // parse date literal
            std::shared_ptr<SvNumberFormatter> pFormatter;
            if (GetSbData()->pInst)
            {
                pFormatter = GetSbData()->pInst->GetNumberFormatter();
            }
            else
            {
                sal_uInt32 nDummy;
                pFormatter = SbiInstance::PrepareNumberFormatter( nDummy, nDummy, nDummy );
            }
            sal_uInt32 nIndex = pFormatter->GetStandardIndex( LANGUAGE_ENGLISH_US);
            bool bSuccess = pFormatter->IsNumberFormat(aSym, nIndex, nVal);
            if( bSuccess )
            {
                // only accept the text if it was recognized as a date
                SvNumFormatType nType_ = pFormatter->GetType(nIndex);
                if( !(nType_ & SvNumFormatType::DATE) )
                    bSuccess = false;
            }

            if (!bSuccess)
                GenError( ERRCODE_BASIC_CONVERSION );

            bNumber = true;
            eScanType = SbxDOUBLE;
        }
        else
        {
            aError = OUString('#');
            GenError( ERRCODE_BASIC_EXPECTED );
        }
    }
    // invalid characters:
    else if (nLineIdx < aLine.getLength() && aLine[nLineIdx] >= 0x7F)
    {
        GenError( ERRCODE_BASIC_SYNTAX ); nLineIdx++; nCol++;
    }
    // other groups:
    else
    {
        // single character symbol, or one of the two character operators
        // "<>", "<=", ">=", ":="
        sal_Int32 n = 1;
        auto nChar = nLineIdx < aLine.getLength() ? aLine[nLineIdx] : 0;
        ++nLineIdx;
        if (nLineIdx < aLine.getLength())
        {
            switch (nChar)
            {
                case '<': if( aLine[nLineIdx] == '>' || aLine[nLineIdx] == '=' ) n = 2; break;
                case '>': if( aLine[nLineIdx] == '=' ) n = 2; break;
                case ':': if( aLine[nLineIdx] == '=' ) n = 2; break;
            }
        }
        aSym = aLine.copy(nCol, std::min(n, aLine.getLength() - nCol));
        nLineIdx += n-1; nCol = nCol + n;
    }

    nCol2 = nCol-1;

PrevLineCommentLbl:

    // comment: either continued from the previous line, or introduced by
    // "'", "REM" or "#" - the rest of the line belongs to it
    if (bPrevLineExtentsComment ||
        (eScanType != SbxSTRING &&
        (aSym.startsWith("'") || aSym.equalsIgnoreAsciiCase("REM") || aSym.startsWith("#"))))
    {
        bPrevLineExtentsComment = false;
        aSym = "REM";
        const sal_Int32 nLineLen = aLine.getLength();
        sal_Int32 nLen = nLineLen - nLineIdx;
        // tdf#149402 - don't extend comment if line ends in a whitespace (BasicCharClass::isWhitespace)
        // The length check keeps the look at the last two characters inside
        // the line when a continued comment line consists of a single "_".
        if (bCompatible && !bLineEndsWithWhitespace && nLineLen >= 2
            && aLine[nLineLen - 1] == '_' && aLine[nLineLen - 2] == ' ')
            bPrevLineExtentsComment = true;
        nCol2 = nCol2 + nLen;
        nLineIdx = -1;
    }

    // nothing was consumed at all
    if (nLineIdx == nLineIdxScanStart)
    {
        GenError( ERRCODE_BASIC_SYMBOL_EXPECTED );
        return false;
    }

    return true;


eoln:
    // a trailing '_' (that is not the end of an identifier) continues the
    // statement on the next line: scan on there
    if (nCol && aLine[--nLineIdx] == '_' && !bClosingUnderscore)
    {
        nLineIdx = -1;
        bool bRes = NextSym();
        if( aSym.startsWith(".") )
        {
            // object _
            //    .Method
            // ^^^  <- spaces is legal in MSO VBA
            bSpaces = false;
        }
        return bRes;
    }
    else
    {
        // report the end of line as symbol "\n" at the position of the
        // previous symbol
        nLineIdx = -1;
        nLine = nOldLine;
        nCol1 = nOldCol1;
        nCol2 = nOldCol2;
        aSym = "\n";
        nColLock = 0;
        bClosingUnderscore = false;
        // tdf#149157 - break multiline continuation in a comment after a new line
        bPrevLineExtentsComment = false;
        return true;
    }
}
 
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
↑ V530 The return value of function 'append' is required to be utilized.