svparser.cxx


/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
 
#include <svtools/svparser.hxx>
#include <svtools/htmltokn.h>
#include <tools/stream.hxx>
#include <tools/debug.hxx>
#include <rtl/textcvt.h>
#include <rtl/tencinfo.h>
#include <rtl/character.hxx>
#include <sal/log.hxx>
#include <unicode/ucsdet.h>
#include <comphelper/configuration.hxx>
 
#include <vector>
 
// structure to store the actual data
template<typename T>
struct SvParser_Impl
{
    OUString        aToken;             // parsed token
    sal_uInt64      nFilePos;           // actual position in stream
    sal_uInt32      nlLineNr;           // actual line number
    sal_uInt32      nlLinePos;          // actual column number
    tools::Long            nTokenValue;        // extra value (RTF)
    bool            bTokenHasValue;     // indicates whether nTokenValue is valid
    T               nToken;             // actual Token
    sal_uInt32      nNextCh;            // actual character
    T               nSaveToken;         // the token from Continue
 
    rtl_TextToUnicodeConverter hConv;
    rtl_TextToUnicodeContext   hContext;
 
    SvParser_Impl()
        : nFilePos(0)
        , nlLineNr(0)
        , nlLinePos(0)
        , nTokenValue(0)
        , bTokenHasValue(false)
        , nToken(static_cast<T>(0))
        , nNextCh(0)
        , nSaveToken(static_cast<T>(0))
        , hConv( nullptr )
        , hContext( reinterpret_cast<rtl_TextToUnicodeContext>(1) )
    {
    }
 
};
 
 
template<typename T>
SvParser<T>::TokenStackType::TokenStackType()
  : nTokenValue(0)
    , bTokenHasValue(false)
    , nTokenId(static_cast<T>(0))
{
}
 
// Constructor
template<typename T>
SvParser<T>::SvParser( SvStream& rIn, sal_uInt8 nStackSize )
    : rInput( rIn )
    , nlLineNr( 1 )
    , nlLinePos( 1 )
    , nConversionErrors( 0 )
    , pImplData( nullptr )
    , m_nTokenIndex(0)
    , nTokenValue( 0 )
    , bTokenHasValue( false )
    , bFuzzing(comphelper::IsFuzzing())
    , eState( SvParserState::NotStarted )
    , eSrcEnc( RTL_TEXTENCODING_DONTKNOW )
    , nNextChPos(0)
    , nNextCh(0)
    , bSwitchToUCS2(false)
    , bRTF_InTextRead(false)
    , nTokenStackSize( nStackSize )
    , nTokenStackPos( 0 )
{
    eState = SvParserState::NotStarted;
    if( nTokenStackSize < 3 )
        nTokenStackSize = 3;
    pTokenStack.reset(new TokenStackType[ nTokenStackSize ]);
    pTokenStackPos = pTokenStack.get();
}
 
template<typename T>
SvParser<T>::~SvParser()
{
    if( pImplData && pImplData->hConv )
    {
        rtl_destroyTextToUnicodeContext( pImplData->hConv,
                                         pImplData->hContext );
        rtl_destroyTextToUnicodeConverter( pImplData->hConv );
    }
 
    pTokenStack.reset();
}
 
template<typename T> SvParserState SvParser<T>::GetStatus() const { return eState; }
template<typename T> sal_uInt32 SvParser<T>::GetLineNr() const       { return nlLineNr; }
template<typename T> sal_uInt32 SvParser<T>::GetLinePos() const      { return nlLinePos; }
template<typename T> void       SvParser<T>::IncLineNr()             { ++nlLineNr; }
template<typename T> sal_uInt32 SvParser<T>::IncLinePos()            { return ++nlLinePos; }
template<typename T> void       SvParser<T>::SetLineNr( sal_uInt32 nlNum ) { nlLineNr = nlNum; }
template<typename T> void       SvParser<T>::SetLinePos( sal_uInt32 nlPos ) {   nlLinePos = nlPos; }
template<typename T> bool       SvParser<T>::IsParserWorking() const { return SvParserState::Working == eState; }
template<typename T> rtl_TextEncoding SvParser<T>::GetSrcEncoding() const { return eSrcEnc; }
template<typename T> void       SvParser<T>::SetSwitchToUCS2( bool bSet ) { bSwitchToUCS2 = bSet; }
template<typename T> bool       SvParser<T>::IsSwitchToUCS2() const { return bSwitchToUCS2; }
template<typename T> sal_uInt16 SvParser<T>::GetCharSize() const { return (RTL_TEXTENCODING_UCS2 == eSrcEnc) ? 2 : 1; }
template<typename T> Link<LinkParamNone*,void> SvParser<T>::GetAsynchCallLink() const
{
    return LINK( const_cast<SvParser*>(this), SvParser, NewDataRead );
}
 
template<typename T>
void SvParser<T>::ClearTxtConvContext()
{
    if( pImplData && pImplData->hConv )
        rtl_resetTextToUnicodeContext( pImplData->hConv, pImplData->hContext );
}
 
template<typename T>
void SvParser<T>::SetSrcEncoding( rtl_TextEncoding eEnc )
{
    if( eEnc == eSrcEnc )
        return;
 
    if( pImplData && pImplData->hConv )
    {
        rtl_destroyTextToUnicodeContext( pImplData->hConv,
                                         pImplData->hContext );
        rtl_destroyTextToUnicodeConverter( pImplData->hConv );
        pImplData->hConv = nullptr;
        pImplData->hContext = reinterpret_cast<rtl_TextToUnicodeContext>(1);
    }
 
    if( rtl_isOctetTextEncoding(eEnc) ||
        RTL_TEXTENCODING_UCS2 == eEnc  )
    {
        eSrcEnc = eEnc;
        if( !pImplData )
            pImplData.reset(new SvParser_Impl<T>);
        pImplData->hConv = rtl_createTextToUnicodeConverter( eSrcEnc );
        DBG_ASSERT( pImplData->hConv,
                    "SvParser::SetSrcEncoding: no converter for source encoding" );
        if( !pImplData->hConv )
            eSrcEnc = RTL_TEXTENCODING_DONTKNOW;
        else
            pImplData->hContext =
                rtl_createTextToUnicodeContext( pImplData->hConv );
    }
    else
    {
        SAL_WARN( "svtools",
                    "SvParser::SetSrcEncoding: invalid source encoding" );
        eSrcEnc = RTL_TEXTENCODING_DONTKNOW;
    }
}
 
template<typename T>
void SvParser<T>::RereadLookahead()
{
    rInput.Seek(nNextChPos);
    nNextCh = GetNextChar();
}
 
template<typename T>
sal_uInt32 SvParser<T>::GetNextChar()
{
    sal_uInt32 c = 0U;
 
    // When reading multiple bytes, we don't have to care about the file
    // position when we run into the pending state. The file position is
    // maintained by SaveState/RestoreState.
    if( bSwitchToUCS2 && 0 == rInput.Tell() )
    {
        rInput.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW);
        if (rInput.good())
        {
            sal_uInt64 nPos = rInput.Tell();
            if (nPos == 2)
                eSrcEnc = RTL_TEXTENCODING_UCS2;
            else if (nPos == 3)
                SetSrcEncoding(RTL_TEXTENCODING_UTF8);
            else // Try to detect encoding without BOM
            {
                std::vector<char> buf(65535); // Arbitrarily chosen 64KiB buffer
                const size_t nSize = rInput.ReadBytes(buf.data(), buf.size());
                rInput.Seek(0);
                if (nSize > 0)
                {
                    UErrorCode uerr = U_ZERO_ERROR;
                    UCharsetDetector* ucd = ucsdet_open(&uerr);
                    ucsdet_setText(ucd, buf.data(), nSize, &uerr);
                    if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr))
                    {
                        const char* pEncodingName = ucsdet_getName(match, &uerr);
 
                        if (U_SUCCESS(uerr))
                        {
                            if (strcmp("UTF-8", pEncodingName) == 0)
                            {
                                SetSrcEncoding(RTL_TEXTENCODING_UTF8);
                            }
                            else if (strcmp("UTF-16LE", pEncodingName) == 0)
                            {
                                eSrcEnc = RTL_TEXTENCODING_UCS2;
                                rInput.SetEndian(SvStreamEndian::LITTLE);
                            }
                            else if (strcmp("UTF-16BE", pEncodingName) == 0)
                            {
                                eSrcEnc = RTL_TEXTENCODING_UCS2;
                                rInput.SetEndian(SvStreamEndian::BIG);
                            }
                        }
                    }
 
                    ucsdet_close(ucd);
                }
            }
        }
        bSwitchToUCS2 = false;
    }
 
    bool bErr;
    nNextChPos = rInput.Tell();
 
    if( RTL_TEXTENCODING_UCS2 == eSrcEnc )
    {
        sal_Unicode cUC;
        rInput.ReadUtf16(cUC);
        bErr = !rInput.good();
        if( !bErr )
        {
            c = cUC;
            if (rtl::isHighSurrogate(cUC))
            {
                const sal_uInt64 nPos = rInput.Tell();
                rInput.ReadUtf16(cUC);
                if (rtl::isLowSurrogate(cUC)) // can only be true when ReadUtf16 succeeded
                    c = rtl::combineSurrogates(c, cUC);
                else
                    rInput.Seek(nPos); // process lone high surrogate
            }
        }
    }
    else
    {
        sal_Size nChars = 0;
        do
        {
            char c1;    // signed, that's the text converter expects
            rInput.ReadChar( c1 );
            bErr = !rInput.good();
            if( !bErr )
            {
                if (
                     RTL_TEXTENCODING_DONTKNOW == eSrcEnc ||
                     RTL_TEXTENCODING_SYMBOL == eSrcEnc
                   )
                {
                    // no conversion shall take place
                    c = reinterpret_cast<unsigned char&>( c1 );
                    nChars = 1;
                }
                else
                {
                    assert(pImplData && pImplData->hConv && "no text converter!");
 
                    sal_Unicode cUC;
                    sal_uInt32 nInfo = 0;
                    sal_Size nCvtBytes;
                    nChars = rtl_convertTextToUnicode(
                                pImplData->hConv, pImplData->hContext,
                                &c1, 1, &cUC, 1,
                                RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
                                RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
                                RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
                                &nInfo, &nCvtBytes);
                    if( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 )
                    {
                        // The conversion wasn't successful because we haven't
                        // read enough characters.
                        if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) )
                        {
                            sal_Unicode sCh[2];
                            while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 )
                            {
                                rInput.ReadChar( c1 );
                                bErr = !rInput.good();
                                if( bErr )
                                    break;
 
                                nChars = rtl_convertTextToUnicode(
                                            pImplData->hConv, pImplData->hContext,
                                            &c1, 1, sCh , 2,
                                            RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
                                            RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
                                            RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
                                            &nInfo, &nCvtBytes);
                            }
                            if( !bErr )
                            {
                                if( 1 == nChars && 0 == nInfo )
                                {
                                    c = sal_uInt32( sCh[0] );
                                }
                                else if( 2 == nChars && 0 == nInfo )
                                {
                                    c = rtl::combineSurrogates( sCh[0], sCh[1] );
                                }
                                else if( 0 != nChars || 0 != nInfo )
                                {
                                    DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0,
                                        "source buffer is too small" );
                                    DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0,
                                         "there is a conversion error" );
                                    DBG_ASSERT( 0 == nChars,
                                       "there is a converted character, but an error" );
                                    // There are still errors, but nothing we can
                                    // do
                                    c = '?';
                                    nChars = 1;
                                    ++nConversionErrors;
                                }
                            }
                        }
                        else
                        {
                            char sBuffer[10];
                            sBuffer[0] = c1;
                            sal_uInt16 nLen = 1;
                            while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 &&
                                    nLen < 10 )
                            {
                                rInput.ReadChar( c1 );
                                bErr = !rInput.good();
                                if( bErr )
                                    break;
 
                                sBuffer[nLen++] = c1;
                                nChars = rtl_convertTextToUnicode(
                                            pImplData->hConv, nullptr, sBuffer, nLen, &cUC, 1,
                                            RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
                                            RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
                                            RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
                                            &nInfo, &nCvtBytes);
                            }
                            if( !bErr )
                            {
                                if( 1 == nChars && 0 == nInfo )
                                {
                                    DBG_ASSERT( nCvtBytes == nLen,
                                                "no all bytes have been converted!" );
                                    c = cUC;
                                }
                                else
                                {
                                    DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0,
                                        "source buffer is too small" );
                                    DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0,
                                         "there is a conversion error" );
                                    DBG_ASSERT( 0 == nChars,
                                       "there is a converted character, but an error" );
 
                                    // There are still errors, so we use the first
                                    // character and restart after that.
                                    c = reinterpret_cast<unsigned char&>( sBuffer[0] );
                                    rInput.SeekRel( -(nLen-1) );
                                    nChars = 1;
                                    ++nConversionErrors;
                                }
                            }
                        }
                    }
                    else if( 1 == nChars && 0 == nInfo )
                    {
                        // The conversion was successful
                        DBG_ASSERT( nCvtBytes == 1,
                                    "no all bytes have been converted!" );
                        c = cUC;
                    }
                    else if( 0 != nChars || 0 != nInfo )
                    {
                        DBG_ASSERT( 0 == nChars,
                                "there is a converted character, but an error" );
                        DBG_ASSERT( 0 != nInfo,
                                "there is no converted character and no error" );
                        // #73398#: If the character could not be converted,
                        // because a conversion is not available, do no conversion at all.
                        c = reinterpret_cast<unsigned char&>( c1 );
                        nChars = 1;
                        ++nConversionErrors;
                    }
                }
            }
        }
        while( 0 == nChars  && !bErr );
    }
 
    if ( ! rtl::isUnicodeScalarValue( c ) )
        c = '?' ;
 
    if (bFuzzing && nConversionErrors > 128)
    {
        SAL_WARN("svtools", "SvParser::GetNextChar too many conversion errors while fuzzing, abandoning for performance");
        bErr = true;
    }
 
    if( bErr )
    {
        if( ERRCODE_IO_PENDING == rInput.GetError() )
        {
            eState = SvParserState::Pending;
            return c;
        }
        else
            return sal_Unicode(EOF);
    }
 
    if( c == '\n' )
    {
        IncLineNr();
        SetLinePos( 1 );
    }
    else
        IncLinePos();
 
    return c;
}
 
template<typename T>
T SvParser<T>::GetNextToken()
{
    T nRet = static_cast<T>(0);
 
    if( !nTokenStackPos )
    {
        aToken.setLength( 0 );     // empty token buffer
        nTokenValue = -1;   // marker for no value read
        bTokenHasValue = false;
 
        nRet = GetNextToken_();
        if( SvParserState::Pending == eState )
            return nRet;
    }
 
    ++pTokenStackPos;
    if( pTokenStackPos == pTokenStack.get() + nTokenStackSize )
        pTokenStackPos = pTokenStack.get();
 
    // pop from stack ??
    if( nTokenStackPos )
    {
        --nTokenStackPos;
        nTokenValue = pTokenStackPos->nTokenValue;
        bTokenHasValue = pTokenStackPos->bTokenHasValue;
        aToken = pTokenStackPos->sToken;
        nRet = pTokenStackPos->nTokenId;
        ++m_nTokenIndex;
    }
    // no, now push actual value on stack
    else if( SvParserState::Working == eState )
    {
        pTokenStackPos->sToken = aToken;
        pTokenStackPos->nTokenValue = nTokenValue;
        pTokenStackPos->bTokenHasValue = bTokenHasValue;
        pTokenStackPos->nTokenId = nRet;
        ++m_nTokenIndex;
    }
    else if( SvParserState::Accepted != eState && SvParserState::Pending != eState )
        eState = SvParserState::Error;       // an error occurred
 
    return nRet;
}
 
template<typename T>
T SvParser<T>::SkipToken( short nCnt )       // "skip" n Tokens backward
{
    pTokenStackPos = GetStackPtr( nCnt );
    short nTmp = nTokenStackPos - nCnt;
    if( nTmp < 0 )
        nTmp = 0;
    else if( nTmp > nTokenStackSize )
        nTmp = nTokenStackSize;
    nTokenStackPos = sal_uInt8(nTmp);
 
    m_nTokenIndex -= nTmp;
 
    // restore values
    aToken = pTokenStackPos->sToken;
    nTokenValue = pTokenStackPos->nTokenValue;
    bTokenHasValue = pTokenStackPos->bTokenHasValue;
 
    return pTokenStackPos->nTokenId;
}
 
template<typename T>
typename SvParser<T>::TokenStackType* SvParser<T>::GetStackPtr( short nCnt )
{
    sal_uInt8 nCurrentPos = sal_uInt8(pTokenStackPos - pTokenStack.get());
    if( nCnt > 0 )
    {
        if( nCnt >= nTokenStackSize )
            nCnt = (nTokenStackSize-1);
        if( nCurrentPos + nCnt < nTokenStackSize )
            nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt);
        else
            nCurrentPos = sal::static_int_cast< sal_uInt8 >(
                nCurrentPos + (nCnt - nTokenStackSize));
    }
    else if( nCnt < 0 )
    {
        if( -nCnt >= nTokenStackSize )
            nCnt = -nTokenStackSize+1;
        if( -nCnt <= nCurrentPos )
            nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt);
        else
            nCurrentPos = sal::static_int_cast< sal_uInt8 >(
                nCurrentPos + (nCnt + nTokenStackSize));
    }
    return pTokenStack.get() + nCurrentPos;
}
 
// to read asynchronous from SvStream
 
template<typename T>
T SvParser<T>::GetSaveToken() const
{
    return pImplData ? pImplData->nSaveToken : static_cast<T>(0);
}
 
template<typename T>
void SvParser<T>::SaveState( T nToken )
{
    // save actual status
    if( !pImplData )
    {
        pImplData.reset(new SvParser_Impl<T>);
        pImplData->nSaveToken = static_cast<T>(0);
    }
 
    pImplData->nFilePos = rInput.Tell();
    pImplData->nToken = nToken;
 
    pImplData->aToken = aToken;
    pImplData->nlLineNr = nlLineNr;
    pImplData->nlLinePos = nlLinePos;
    pImplData->nTokenValue= nTokenValue;
    pImplData->bTokenHasValue = bTokenHasValue;
    pImplData->nNextCh = nNextCh;
}
 
template<typename T>
void SvParser<T>::RestoreState()
{
    // restore old status
    if( !pImplData )
        return;
 
    if( ERRCODE_IO_PENDING == rInput.GetError() )
        rInput.ResetError();
    aToken = pImplData->aToken;
    nlLineNr = pImplData->nlLineNr;
    nlLinePos = pImplData->nlLinePos;
    nTokenValue= pImplData->nTokenValue;
    bTokenHasValue=pImplData->bTokenHasValue;
    nNextCh = pImplData->nNextCh;
 
    pImplData->nSaveToken = pImplData->nToken;
 
    rInput.Seek( pImplData->nFilePos );
}
 
template<typename T>
void SvParser<T>::Continue( T )
{
}
 
 
// expanded out version of
//   IMPL_LINK_NOARG( SvParser, NewDataRead, LinkParamNone*, void )
// since it can't cope with template methods
template<typename T>
void SvParser<T>::LinkStubNewDataRead(void * instance, LinkParamNone* data) {
    return static_cast<SvParser<T> *>(instance)->NewDataRead(data);
}
template<typename T>
void SvParser<T>::NewDataRead(SAL_UNUSED_PARAMETER LinkParamNone*)
{
    switch( eState )
    {
    case SvParserState::Pending:
        eState = SvParserState::Working;
        RestoreState();
 
        Continue( pImplData->nToken );
 
        if( ERRCODE_IO_PENDING == rInput.GetError() )
            rInput.ResetError();
 
        if( SvParserState::Pending != eState )
            ReleaseRef();                    // ready otherwise!
        break;
 
    case SvParserState::NotStarted:
    case SvParserState::Working:
        break;
 
    default:
        ReleaseRef();                    // ready otherwise!
        break;
    }
}
 
template class SVT_DLLPUBLIC SvParser<int>;
template class SVT_DLLPUBLIC SvParser<HtmlTokenId>;
 
/*========================================================================
 *
 * SvKeyValueIterator.
 *
 *======================================================================*/
 
typedef std::vector<SvKeyValue> SvKeyValueList_Impl;
 
struct SvKeyValueIterator::Impl
{
    SvKeyValueList_Impl maList;
    sal_uInt16 mnPos;
 
    Impl() : mnPos(0) {}
};
 
SvKeyValueIterator::SvKeyValueIterator() : mpImpl(new Impl) {}
 
SvKeyValueIterator::~SvKeyValueIterator() = default;
 
bool SvKeyValueIterator::GetFirst (SvKeyValue &rKeyVal)
{
    mpImpl->mnPos = mpImpl->maList.size();
    return GetNext (rKeyVal);
}
 
bool SvKeyValueIterator::GetNext (SvKeyValue &rKeyVal)
{
    if (mpImpl->mnPos > 0)
    {
        rKeyVal = mpImpl->maList[--mpImpl->mnPos];
        return true;
    }
    else
    {
        // Nothing to do.
        return false;
    }
}
 
void SvKeyValueIterator::Append (const SvKeyValue &rKeyVal)
{
    mpImpl->maList.push_back(rKeyVal);
}
 
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
↑ V1048 The 'eState' variable was assigned the same value.