/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
 
 
#include <pdfparse.hxx>
 
#include <boost/spirit/include/classic.hpp>
#include <boost/bind/bind.hpp>
 
#include <string.h>
 
#include <o3tl/char16_t2wchar_t.hxx>
#include <o3tl/safeint.hxx>
#include <osl/thread.h>
#include <rtl/strbuf.hxx>
#include <rtl/ustrbuf.hxx>
#include <sal/log.hxx>
#include <utility>
 
 
using namespace boost::spirit::classic;
using namespace pdfparse;
 
namespace {
 
class StringEmitContext : public EmitContext
{
    OStringBuffer m_aBuf;
    public:
    StringEmitContext() :  m_aBuf(256) {}
 
    virtual bool write( const void* pBuf, unsigned int nLen ) noexcept override
    {
        m_aBuf.append( static_cast<const char*>(pBuf), nLen );
        return true;
    }
    virtual unsigned int getCurPos() noexcept override { return m_aBuf.getLength(); }
    virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept override
    { return (nOrigOffset+nLen < o3tl::make_unsigned(m_aBuf.getLength()) ) &&
             write( m_aBuf.getStr() + nOrigOffset, nLen ); }
    virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept override
    {
        if( nOrigOffset+nLen < o3tl::make_unsigned(m_aBuf.getLength()) )
        {
            memcpy( pBuf, m_aBuf.getStr()+nOrigOffset, nLen );
            return nLen;
        }
        return 0;
    }
 
    OString getString() { return m_aBuf.makeStringAndClear(); }
};
 
template< class iteratorT >
class PDFGrammar :  public grammar< PDFGrammar<iteratorT> >
{
public:
 
    /// @param first iterator to the very start of the input; kept in
    ///              m_aGlobalBegin so actions can turn iterators into offsets.
    explicit PDFGrammar( iteratorT first )
    : m_fDouble( 0.0 ), m_aGlobalBegin(std::move( first )) {}
    /// Frees whatever is still left on the object stack.
    ~PDFGrammar()
    {
        // Only the bottom entry is deleted here: entries above it were added
        // to a container's m_aSubElements at the time they were pushed.
        PDFEntry* pRoot = m_aObjectStack.empty() ? nullptr : m_aObjectStack.front();
        delete pRoot; // deleting nullptr is a no-op
    }
 
    // last value stored by assign_action_double(), consumed by pushDouble()
    double m_fDouble;
    // unsigned integers collected by push_back_action_uint(); popped in
    // reverse order by haveFile(), beginObject() and pushObjectRef()
    std::vector< unsigned int > m_aUIntStack;
    // stack of the entries currently being filled; back() is the innermost one
    std::vector< PDFEntry* >    m_aObjectStack;
    // backs the message pointer handed to parseError() in endDict()
    OString                m_aErrorString;
    // start of the input; subtracted from iterators to obtain offsets
    iteratorT                   m_aGlobalBegin;
 
public:
    /** Functor (used through functor_parser) that scans the inside of a
        parenthesised string.

        Parentheses may nest; a backslash causes the character after it to be
        stepped over without being inspected. Scanning stops in front of the
        ')' that has no matching '('. Returns the counted length, or -1 if
        the input ran out first.
    */
    struct pdf_string_parser
    {
        typedef nil_t result_t;
        template <typename ScannerT>
        std::ptrdiff_t
        operator()(ScannerT const& scan, result_t&) const
        {
            std::ptrdiff_t nConsumed = 0;
            int nDepth = 0; // nesting depth of parentheses seen so far

            // leaving the loop via break skips the increment part, so the
            // character that stopped the scan is not consumed
            for( ; !scan.at_end(); ++nConsumed, ++scan )
            {
                const char cCur = *scan;
                if( cCur == '(' )
                    ++nDepth;
                else if( cCur == ')' )
                {
                    if( --nDepth < 0 )
                        break;
                }
                else if( cCur == '\\' ) // ignore escaped braces
                {
                    ++nConsumed;
                    ++scan.first;                 // tdf#63054: avoid skipping spaces
                    if( scan.first == scan.last ) // tdf#63054: avoid skipping spaces
                        break;
                }
            }
            return scan.at_end() ? -1 : nConsumed;
        }
    };
 
    /** Rule set of the grammar for one scanner type.

        The constructor wires every rule to a member function of the
        enclosing PDFGrammar via boost::bind; those semantic actions build up
        the entry tree on m_aObjectStack. start() names pdfrule as the entry
        rule.
    */
    template< typename ScannerT >
    struct definition
    {
        explicit definition( const PDFGrammar<iteratorT>& rSelf )
        {
            using namespace boost::placeholders;

            // only a const reference is handed in, but the bound actions
            // modify the grammar's stacks
            PDFGrammar<iteratorT>* pSelf = const_cast< PDFGrammar<iteratorT>* >( &rSelf );

            // workaround workshop compiler: comment_p doesn't work
            // comment     = comment_p("%")[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )];
            // '%' followed by everything up to and including the end of line
            comment     = lexeme_d[ (ch_p('%') >> *(~ch_p('\r') & ~ch_p('\n')) >> eol_p)[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )] ];

            boolean     = (str_p("true") | str_p("false"))[boost::bind(&PDFGrammar::pushBool, pSelf, _1, _2)];

            // workaround workshop compiler: confix_p doesn't work
            //stream      = confix_p( "stream", *anychar_p, "endstream" )[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
            // "stream", then anything up to the first "endstream"
            stream      = (str_p("stream") >> *(anychar_p - str_p("endstream")) >> str_p("endstream"))[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];

            // '/' followed by any run of characters that are neither
            // whitespace, one of the listed delimiters, nor NUL; the action
            // receives the part after the '/'
            name        = lexeme_d[
                            ch_p('/')
                            >> (*(anychar_p-chset_p("\t\n\f\r ()<>[]{}/%")-ch_p('\0')))
                               [boost::bind(&PDFGrammar::pushName, pSelf, _1, _2)] ];

            // workaround workshop compiler: confix_p doesn't work
            //stringtype  = ( confix_p("(",*anychar_p, ")") |
            //                confix_p("<",*xdigit_p,  ">") )
            //              [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];

            // either "( ... )" scanned by pdf_string_parser, or "<hex digits>"
            stringtype  = ( ( ch_p('(') >> functor_parser<pdf_string_parser>() >> ch_p(')') ) |
                            ( ch_p('<') >> *xdigit_p >> ch_p('>') ) )
                          [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];

            null_object = str_p( "null" )[boost::bind(&PDFGrammar::pushNull, pSelf, _1, _2)];

            // "<uint> <uint> R"; both numbers are pushed onto m_aUIntStack
            // as soon as each uint_p matches
            #ifdef USE_ASSIGN_ACTOR
            objectref   = ( uint_p[push_back_a(pSelf->m_aUIntStack)]
                            >> uint_p[push_back_a(pSelf->m_aUIntStack)]
                            >> ch_p('R')
                            >> eps_p
                          )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
            #else
            objectref   = ( uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
                            >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
                            >> ch_p('R')
                            >> eps_p
                          )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
            #endif

            // objectref is listed ahead of the plain number alternative
            #ifdef USE_ASSIGN_ACTOR
            simple_type = objectref | name |
                          ( real_p[assign_a(pSelf->m_fDouble)] >> eps_p )
                          [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
                          | stringtype | boolean | null_object;
            #else
            simple_type = objectref | name |
                          ( real_p[boost::bind(&PDFGrammar::assign_action_double, pSelf, _1)] >> eps_p )
                          [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
                          | stringtype | boolean | null_object;
            #endif

            dict_begin  = str_p( "<<" )[boost::bind(&PDFGrammar::beginDict, pSelf, _1, _2)];
            dict_end    = str_p( ">>" )[boost::bind(&PDFGrammar::endDict, pSelf, _1, _2)];

            array_begin = str_p("[")[boost::bind(&PDFGrammar::beginArray,pSelf, _1, _2)];
            array_end   = str_p("]")[boost::bind(&PDFGrammar::endArray,pSelf, _1, _2)];

            // "<uint> <uint> obj"
            #ifdef USE_ASSIGN_ACTOR
            object_begin= uint_p[push_back_a(pSelf->m_aUIntStack)]
                          >> uint_p[push_back_a(pSelf->m_aUIntStack)]
                          >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
            #else
            object_begin= uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
                          >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
                          >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
            #endif
            object_end  = str_p( "endobj" )[boost::bind(&PDFGrammar::endObject, pSelf, _1, _2)];

            // "xref <uint> <uint>" followed by one or more entries of the form
            // 10 digits, blank, 5 digits, blank, 'n' or 'f', 2 whitespace
            // characters; no semantic action is attached
            xref        = str_p( "xref" ) >> uint_p >> uint_p
                          >> lexeme_d[
                                +( repeat_p(10)[digit_p]
                                   >> blank_p
                                   >> repeat_p(5)[digit_p]
                                   >> blank_p
                                   >> ( ch_p('n') | ch_p('f') )
                                   >> repeat_p(2)[space_p]
                                 ) ];

            // nesting is not expressed in the grammar itself: begin/end tokens
            // are separate alternatives and get matched up by the actions
            dict_element= dict_begin | comment | simple_type
                          | array_begin | array_end | dict_end;

            object      = object_begin
                          >> *dict_element
                          >> !stream
                          >> object_end;

            trailer     = str_p( "trailer" )[boost::bind(&PDFGrammar::beginTrailer,pSelf,_1,_2)]
                          >> *dict_element
                          >> str_p("startxref")
                          >> uint_p
                          >> str_p("%%EOF")[boost::bind(&PDFGrammar::endTrailer,pSelf,_1,_2)];

            // optional "%PDF-<major>.<minor>" header line, then any sequence
            // of comments, objects and xref/trailer pairs
            #ifdef USE_ASSIGN_ACTOR
            pdfrule     = ! (lexeme_d[
                                str_p( "%PDF-" )
                                >> uint_p[push_back_a(pSelf->m_aUIntStack)]
                                >> ch_p('.')
                                >> uint_p[push_back_a(pSelf->m_aUIntStack)]
                                >> *((~ch_p('\r') & ~ch_p('\n')))
                                >> eol_p
                             ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
                          >> *( comment | object | ( xref >> trailer ) );
            #else
            pdfrule     = ! (lexeme_d[
                                str_p( "%PDF-" )
                                >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
                                >> ch_p('.')
                                >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
                                >> *(~ch_p('\r') & ~ch_p('\n'))
                                >> eol_p
                             ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
                          >> *( comment | object | ( xref >> trailer ) );
            #endif
        }
        // NOTE(review): 'array' and 'value' are declared here but are not
        // assigned anywhere in the constructor above.
        rule< ScannerT > comment, stream, boolean, name, stringtype, null_object, simple_type,
                         objectref, array, value, dict_element, dict_begin, dict_end,
                         array_begin, array_end, object, object_begin, object_end,
                         xref, trailer, pdfrule;

        /// Entry rule of the grammar.
        const rule< ScannerT >& start() const { return pdfrule; }
    };
 
    #ifndef USE_ASSIGN_ACTOR
    /// Semantic action for uint_p: remember the parsed number on m_aUIntStack.
    void push_back_action_uint( unsigned int i )
    {
        m_aUIntStack.push_back( i );
    }
    /// Semantic action for real_p: remember the parsed number in m_fDouble.
    void assign_action_double( double d )
    {
        m_fDouble = d;
    }
    #endif
 
    /** Abort parsing by raising a parse error via throw_().

        Never returns. pMessage is passed on as a plain pointer, so the text
        it points to has to stay alive until the error has been handled.
    */
    [[noreturn]] static void parseError( const char* pMessage, const iteratorT& pLocation )
    {
        throw_( pLocation, pMessage );
    }
 
    /// Collect the characters in [first, last) into an OString.
    OString iteratorToString( iteratorT first, const iteratorT& last ) const
    {
        OStringBuffer aCollected( 32 );
        for( ; first != last; ++first )
            aCollected.append( *first );
        return aCollected.makeStringAndClear();
    }
 
    /** Action for the file header: start a PDFFile as bottom stack entry.

        Raises a parse error if anything is on the object stack already.
    */
    void haveFile( const iteratorT& pBegin, SAL_UNUSED_PARAMETER iteratorT /*pEnd*/ )
    {
        if( !m_aObjectStack.empty() )
            parseError( "found file header in unusual place", pBegin );

        auto* pNewFile = new PDFFile();
        // the minor version was pushed last, hence it is taken off first
        pNewFile->m_nMinor = m_aUIntStack.back();
        m_aUIntStack.pop_back();
        pNewFile->m_nMajor = m_aUIntStack.back();
        m_aUIntStack.pop_back();
        m_aObjectStack.push_back( pNewFile );
    }
 
    void pushComment(const iteratorT& first, const iteratorT& last)
    {
        // add a comment to the current stack element
        PDFComment* pComment =
            new PDFComment(iteratorToString(first,last));
        if( m_aObjectStack.empty() )
            m_aObjectStack.push_back( new PDFPart() );
        PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
        if( pContainer == nullptr )
            parseError( "comment without container", first );
        pContainer->m_aSubElements.emplace_back( pComment );
    }
 
    void insertNewValue( std::unique_ptr<PDFEntry> pNewValue, const iteratorT& pPos )
    {
        PDFContainer* pContainer = nullptr;
        const char* pMsg = nullptr;
        if( ! m_aObjectStack.empty() )
        {
            pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
            if (pContainer)
            {
                if( dynamic_cast<PDFDict*>(pContainer) == nullptr &&
                    dynamic_cast<PDFArray*>(pContainer) == nullptr )
                {
                    PDFObject* pObj = dynamic_cast<PDFObject*>(pContainer);
                    if( pObj )
                    {
                        if( pObj->m_pObject == nullptr )
                            pObj->m_pObject = pNewValue.get();
                        else
                        {
                            pMsg = "second value for object";
                            pContainer = nullptr;
                        }
                    }
                    else if( dynamic_cast<PDFDict*>(pNewValue.get()) )
                    {
                        PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pContainer);
                        if( pTrailer )
                        {
                            if( pTrailer->m_pDict == nullptr )
                                pTrailer->m_pDict = dynamic_cast<PDFDict*>(pNewValue.get());
                            else
                                pContainer = nullptr;
                        }
                        else
                            pContainer = nullptr;
                    }
                    else
                        pContainer = nullptr;
                }
            }
        }
        if( pContainer )
            pContainer->m_aSubElements.emplace_back( std::move(pNewValue) );
        else
        {
            if( ! pMsg )
            {
                if( dynamic_cast<PDFContainer*>(pNewValue.get()) )
                    pMsg = "array without container";
                else
                    pMsg = "value without container";
            }
            parseError( pMsg, pPos );
        }
    }
 
    void pushName(const iteratorT& first, const iteratorT& last )
    {
        insertNewValue( std::make_unique<PDFName>(iteratorToString(first,last)), first );
    }
 
    void pushDouble( const iteratorT& first, SAL_UNUSED_PARAMETER const iteratorT& /*last*/ )
    {
        insertNewValue( std::make_unique<PDFNumber>(m_fDouble), first );
    }
 
    void pushString( const iteratorT& first, const iteratorT& last )
    {
        insertNewValue( std::make_unique<PDFString>(iteratorToString(first,last)), first );
    }
 
    void pushBool( const iteratorT& first, const iteratorT& last )
    {
        insertNewValue( std::make_unique<PDFBool>( last-first == 4 ), first );
    }
 
    void pushNull( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
    {
        insertNewValue( std::make_unique<PDFNull>(), first );
    }
 
    void beginObject( const iteratorT& first, SAL_UNUSED_PARAMETER const iteratorT& /*last*/ )
    {
        if( m_aObjectStack.empty() )
            m_aObjectStack.push_back( new PDFPart() );
 
        unsigned int nGeneration = m_aUIntStack.back();
        m_aUIntStack.pop_back();
        unsigned int nObject = m_aUIntStack.back();
        m_aUIntStack.pop_back();
 
        PDFObject* pObj = new PDFObject( nObject, nGeneration );
        pObj->m_nOffset = first - m_aGlobalBegin;
 
        PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
        if( pContainer &&
            ( dynamic_cast<PDFFile*>(pContainer) ||
              dynamic_cast<PDFPart*>(pContainer) ) )
        {
            pContainer->m_aSubElements.emplace_back( pObj );
            m_aObjectStack.push_back( pObj );
        }
        else
            parseError( "object in wrong place", first );
    }
 
    /// Action for "endobj": close the PDFObject on top of the object stack,
    /// or raise a parse error if there is none.
    void endObject( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
    {
        // parseError() never returns, so each check acts as a guard
        if( m_aObjectStack.empty() )
            parseError( "endobj without obj", first );
        if( !dynamic_cast<PDFObject*>(m_aObjectStack.back()) )
            parseError( "spurious endobj", first );
        m_aObjectStack.pop_back();
    }
 
    void pushObjectRef( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
    {
        unsigned int nGeneration = m_aUIntStack.back();
        m_aUIntStack.pop_back();
        unsigned int nObject = m_aUIntStack.back();
        m_aUIntStack.pop_back();
        insertNewValue( std::make_unique<PDFObjectRef>(nObject,nGeneration), first );
    }
 
    void beginDict( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
    {
        PDFDict* pDict = new PDFDict();
        pDict->m_nOffset = first - m_aGlobalBegin;
 
        insertNewValue( std::unique_ptr<PDFEntry>(pDict), first );
        // will not come here if insertion fails (exception)
        m_aObjectStack.push_back( pDict );
    }
 
    void endDict( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
    {
        PDFDict* pDict = nullptr;
        if( m_aObjectStack.empty() )
            parseError( "dictionary end without begin", first );
        else if( (pDict = dynamic_cast<PDFDict*>(m_aObjectStack.back())) == nullptr )
            parseError( "spurious dictionary end", first );
        else
            m_aObjectStack.pop_back();
 
        PDFEntry* pOffender = pDict->buildMap();
        if( pOffender )
        {
            StringEmitContext aCtx;
            aCtx.write( "offending dictionary element: ", 30 );
            pOffender->emit( aCtx );
            m_aErrorString = aCtx.getString();
            parseError( m_aErrorString.getStr(), first );
        }
    }
 
    void beginArray( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
    {
        PDFArray* pArray = new PDFArray();
        pArray->m_nOffset = first - m_aGlobalBegin;
 
        insertNewValue( std::unique_ptr<PDFEntry>(pArray), first );
        // will not come here if insertion fails (exception)
        m_aObjectStack.push_back( pArray );
    }
 
    /// Action for "]": close the PDFArray on top of the object stack, or
    /// raise a parse error if there is none.
    void endArray( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
    {
        // parseError() never returns, so each check acts as a guard
        if( m_aObjectStack.empty() )
            parseError( "array end without begin", first );
        if( !dynamic_cast<PDFArray*>(m_aObjectStack.back()) )
            parseError( "spurious array end", first );
        m_aObjectStack.pop_back();
    }
 
    void emitStream(const iteratorT& first, const iteratorT& last)
    {
        if( m_aObjectStack.empty() )
            parseError( "stream without object", first );
        PDFObject* pObj = dynamic_cast<PDFObject*>(m_aObjectStack.back());
        if( pObj && pObj->m_pObject )
        {
            if( pObj->m_pStream )
                parseError( "multiple streams in object", first );
 
            PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
            if( pDict )
            {
                PDFStream* pStream = new PDFStream( first - m_aGlobalBegin, last - m_aGlobalBegin, pDict );
 
                pObj->m_pStream = pStream;
                pObj->m_aSubElements.emplace_back( pStream );
            }
        }
        else
            parseError( "stream without object", first );
    }
 
    void beginTrailer( const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
    {
        if( m_aObjectStack.empty() )
            m_aObjectStack.push_back( new PDFPart() );
 
        PDFTrailer* pTrailer = new PDFTrailer();
        pTrailer->m_nOffset = first - m_aGlobalBegin;
 
        PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
        if( pContainer &&
            ( dynamic_cast<PDFFile*>(pContainer) ||
              dynamic_cast<PDFPart*>(pContainer) ) )
        {
            pContainer->m_aSubElements.emplace_back( pTrailer );
            m_aObjectStack.push_back( pTrailer );
        }
        else
            parseError( "trailer in wrong place", first );
    }
 
    /// Action for "%%EOF": close the PDFTrailer on top of the object stack,
    /// or raise a parse error if there is none.
    void endTrailer(const iteratorT& first, SAL_UNUSED_PARAMETER iteratorT )
    {
        // parseError() never returns, so each check acts as a guard
        if( m_aObjectStack.empty() )
            parseError( "%%EOF without trailer", first );
        if( !dynamic_cast<PDFTrailer*>(m_aObjectStack.back()) )
            parseError( "spurious %%EOF", first );
        m_aObjectStack.pop_back();
    }
};
 
}
 
std::unique_ptr<PDFEntry> PDFReader::read(std::u16string_view aFileName)
{
#ifdef _WIN32
    file_iterator<> file_start(std::wstring(o3tl::toW(aFileName)));
#else
    file_iterator<> file_start(
        std::string(OUStringToOString(aFileName, osl_getThreadTextEncoding())));
#endif
    if( ! file_start )
        return nullptr;
    file_iterator<> file_end = file_start.make_end();
    PDFGrammar< file_iterator<> > aGrammar( file_start );
 
    try
    {
#if OSL_DEBUG_LEVEL > 0
        boost::spirit::classic::parse_info< file_iterator<> > aInfo =
#endif
            boost::spirit::classic::parse( file_start,
                                  file_end,
                                  aGrammar,
                                  boost::spirit::classic::space_p );
#if OSL_DEBUG_LEVEL > 0
        SAL_INFO("sdext.pdfimport.pdfparse", "parseinfo: stop at offset = " << aInfo.stop - file_start << ", hit = " << (aInfo.hit ? "true" : "false") << ", full = " << (aInfo.full ? "true" : "false") << ", length = " << aInfo.length);
#endif
    }
    catch( const parser_error< const char*, file_iterator<> >& rError )
    {
        SAL_WARN("sdext.pdfimport.pdfparse", "parse error: " << rError.descriptor << " at buffer pos " << rError.where - file_start);
#if OSL_DEBUG_LEVEL > 0
        OUStringBuffer aTmp;
        unsigned int nElem = aGrammar.m_aObjectStack.size();
        for( unsigned int i = 0; i < nElem; i++ )
        {
            aTmp.append("   ");
            aTmp.appendAscii(typeid( *(aGrammar.m_aObjectStack[i]) ).name());
        }
        SAL_WARN("sdext.pdfimport.pdfparse", "parse error object stack: " << aTmp.makeStringAndClear());
#endif
    }
 
    std::unique_ptr<PDFEntry> pRet;
    unsigned int nEntries = aGrammar.m_aObjectStack.size();
    if( nEntries == 1 )
    {
        pRet.reset(aGrammar.m_aObjectStack.back());
        aGrammar.m_aObjectStack.pop_back();
    }
    else if( nEntries > 1 )
    {
        // It is possible that there are multiple trailers, which is OK.
        // But still keep the warnings, just in case.
        SAL_WARN("sdext.pdfimport.pdfparse", "error got " << nEntries << " stack objects in parse");
        for (;;)
        {
            PDFEntry* pEntry = aGrammar.m_aObjectStack.back();
            aGrammar.m_aObjectStack.pop_back();
            SAL_WARN("sdext.pdfimport.pdfparse", typeid(*pEntry).name());
            PDFObject* pObj = dynamic_cast<PDFObject*>(pEntry);
            if( pObj )
                SAL_WARN("sdext.pdfimport.pdfparse", "   -> object " << pObj->m_nNumber << " generation " << pObj->m_nGeneration);
            if (aGrammar.m_aObjectStack.empty())
            {
                pRet.reset(pEntry); // The first entry references all others - see PDFGrammar dtor
                break;
            }
        }
    }
    return pRet;
}
 
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

// Static analysis note (V530): the return value of function 'append' is required to be utilized.
// Static analysis note (V530): the return value of function 'appendAscii' is required to be utilized.