xml2utf.cxx

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 . */ #include <string.h> #include <algorithm> #include <sal/types.h> #include <rtl/textenc.h> #include <rtl/tencinfo.h> #include <com/sun/star/io/NotConnectedException.hpp> #include <com/sun/star/io/XInputStream.hpp> #include <xml2utf.hxx> #include <memory> using namespace ::com::sun::star::uno; using namespace ::com::sun::star::io; namespace sax_expatwrap { sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead ) { if( ! m_in.is() ) { throw NotConnectedException(); } if( ! m_bStarted ) { // it should be possible to find the encoding attribute // within the first 512 bytes == 128 chars in UCS-4 nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead ); } sal_Int32 nRead; Sequence< sal_Int8 > seqStart; while( true ) { nRead = m_in->readSomeBytes( seq , nMaxToRead ); if( nRead + seqStart.getLength()) { // if nRead is 0, the file is already eof. if( ! m_bStarted && nRead ) { // ensure that enough data is available to parse encoding if( seqStart.hasElements() ) { // prefix with what we had so far. sal_Int32 nLength = seq.getLength(); seq.realloc( seqStart.getLength() + nLength ); memmove (seq.getArray() + seqStart.getLength(), seq.getConstArray(), nLength); memcpy (seq.getArray(), seqStart.getConstArray(), seqStart.getLength()); } // autodetection with the first bytes if( ! isEncodingRecognizable( seq ) ) { // remember what we have so far. seqStart = seq; // read more ! continue; } if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) { // initialize decoding initializeDecoding(); } seqStart = Sequence < sal_Int8 > (); } // do the encoding if( m_pText2Unicode && m_pUnicode2Text && m_pText2Unicode->canContinue() ) { Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq ); seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() ); } if( ! m_bStarted ) { // it must now be ensured, that no encoding attribute exist anymore // ( otherwise the expat-Parser will crash ) // This must be done after decoding ! // ( e.g. Files decoded in ucs-4 cannot be read properly ) m_bStarted = true; removeEncoding( seq ); } nRead = seq.getLength(); } break; } return nRead; } void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq ) { const sal_Int8 *pSource = seq.getArray(); if (seq.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) return; // scan for encoding OString str( reinterpret_cast<char const *>(pSource), seq.getLength() ); // cut sequence to first line break // find first line break; int nMax = str.indexOf( 10 ); if( nMax >= 0 ) { str = str.copy( 0 , nMax ); } int nFound = str.indexOf( " encoding" ); if( nFound < 0 ) return; int nStop; int nStart = str.indexOf( "\"" , nFound ); if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) { nStart = str.indexOf( "'" , nFound ); nStop = str.indexOf( "'" , nStart +1 ); } else { nStop = str.indexOf( "\"" , nStart +1); } if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) { // remove encoding tag from file memmove( &( seq.getArray()[nFound] ) , &( seq.getArray()[nStop+1]) , seq.getLength() - nStop -1); seq.realloc( seq.getLength() - ( nStop+1 - nFound ) ); } } // Checks, if enough data has been accumulated to recognize the encoding bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq) { const sal_Int8 *pSource = seq.getConstArray(); bool bCheckIfFirstClosingBracketExists = false; if( seq.getLength() < 8 ) { // no recognition possible, when less than 8 bytes are available return false; } if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 5 ) ) { // scan if the <?xml tag finishes within this buffer bCheckIfFirstClosingBracketExists = true; } else if( ('<' == pSource[0] || '<' == pSource[2] ) && ('?' == pSource[4] || '?' == pSource[6] ) ) { // check for utf-16 bCheckIfFirstClosingBracketExists = true; } else if( ( '<' == pSource[1] || '<' == pSource[3] ) && ( '?' == pSource[5] || '?' == pSource[7] ) ) { // check for bCheckIfFirstClosingBracketExists = true; } if( bCheckIfFirstClosingBracketExists ) { // whole <?xml tag is valid return std::find(seq.begin(), seq.end(), '>') != seq.end(); } // No <? tag in front, no need for a bigger buffer return true; } bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq ) { const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() ); bool bReturn = true; if( seq.getLength() < 4 ) { // no recognition possible, when less than 4 bytes are available return false; } // first level : detect possible file formats if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) { // scan for encoding OString str( reinterpret_cast<const char *>(pSource), seq.getLength() ); // cut sequence to first line break //find first line break; int nMax = str.indexOf( 10 ); if( nMax >= 0 ) { str = str.copy( 0 , nMax ); } int nFound = str.indexOf( " encoding" ); if( nFound >= 0 ) { int nStop; int nStart = str.indexOf( "\"" , nFound ); if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) { nStart = str.indexOf( "'" , nFound ); nStop = str.indexOf( "'" , nStart +1 ); } else { nStop = str.indexOf( "\"" , nStart +1); } if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) { // encoding found finally m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 ); } } } else if( 0xFE == pSource[0] && 0xFF == pSource[1] ) { // UTF-16 big endian // conversion is done so that encoding information can be easily extracted m_sEncoding = "utf-16"_ostr; } else if( 0xFF == pSource[0] && 0xFE == pSource[1] ) { // UTF-16 little endian // conversion is done so that encoding information can be easily extracted m_sEncoding = "utf-16"_ostr; } else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) { // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.) // The byte order mark is simply added // simply add the byte order mark ! seq.realloc( seq.getLength() + 2 ); memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE; reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF; m_sEncoding = "utf-16"_ostr; } else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) { // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.) // The byte order mark is simply added seq.realloc( seq.getLength() + 2 ); memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF; reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE; m_sEncoding = "utf-16"_ostr; } else if( 0xEF == pSource[0] && 0xBB == pSource[1] && 0xBF == pSource[2] ) { // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order // The BOM is removed. memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 ); seq.realloc( seq.getLength() - 3 ); m_sEncoding = "utf-8"_ostr; } else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) { // UCS-4 big endian m_sEncoding = "ucs-4"_ostr; } else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) { // UCS-4 little endian m_sEncoding = "ucs-4"_ostr; } /* TODO: no need to test for the moment since we return sal_False like default case anyway else if( 0x4c == pSource[0] && 0x6f == pSource[1] && 0xa7 == static_cast<unsigned char> (pSource[2]) && 0x94 == static_cast<unsigned char> (pSource[3]) ) { // EBCDIC bReturn = sal_False; // must be extended } */ else { // other // UTF8 is directly recognized by the parser. bReturn = false; } return bReturn; } void XMLFile2UTFConverter::initializeDecoding() { if( !m_sEncoding.isEmpty() ) { rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() ); if( encoding != RTL_TEXTENCODING_UTF8 ) { m_pText2Unicode = std::make_unique<Text2UnicodeConverter>( m_sEncoding ); m_pUnicode2Text = std::make_unique<Unicode2TextConverter>( RTL_TEXTENCODING_UTF8 ); } } } // Text2UnicodeConverter Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding ) : m_convText2Unicode(nullptr) , m_contextText2Unicode(nullptr) { rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() ); if( RTL_TEXTENCODING_DONTKNOW == encoding ) { m_bCanContinue = false; m_bInitialized = false; } else { init( encoding ); } } Text2UnicodeConverter::~Text2UnicodeConverter() { if( m_bInitialized ) { rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode ); rtl_destroyUnicodeToTextConverter( m_convText2Unicode ); } } void Text2UnicodeConverter::init( rtl_TextEncoding encoding ) { m_bCanContinue = true; m_bInitialized = true; m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding); m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode ); } Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText ) { sal_uInt32 uiInfo; sal_Size nSrcCvtBytes = 0; sal_Size nTargetCount = 0; sal_Size nSourceCount = 0; // the whole source size sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength(); Sequence<sal_Unicode> seqUnicode ( nSourceSize ); const sal_Int8 *pbSource = seqText.getConstArray(); std::unique_ptr<sal_Int8[]> pbTempMem; if( m_seqSource.hasElements() ) { // put old rest and new byte sequence into one array pbTempMem.reset(new sal_Int8[ nSourceSize ]); memcpy( pbTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() ); memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() ); pbSource = pbTempMem.get(); // set to zero again m_seqSource = Sequence< sal_Int8 >(); } while( true ) { /* All invalid characters are transformed to the unicode undefined char */ nTargetCount += rtl_convertTextToUnicode( m_convText2Unicode, m_contextText2Unicode, reinterpret_cast<const char *>(&( pbSource[nSourceCount] )), nSourceSize - nSourceCount , &( seqUnicode.getArray()[ nTargetCount ] ), seqUnicode.getLength() - nTargetCount, RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT, &uiInfo, &nSrcCvtBytes ); nSourceCount += nSrcCvtBytes; if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ) { // save necessary bytes for next conversion seqUnicode.realloc( seqUnicode.getLength() * 2 ); continue; } break; } if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL ) { m_seqSource.realloc( nSourceSize - nSourceCount ); memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount ); } // set to correct unicode size seqUnicode.realloc( nTargetCount ); return seqUnicode; } // Unicode2TextConverter Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding ) { m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding ); m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text ); } Unicode2TextConverter::~Unicode2TextConverter() { rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text ); rtl_destroyUnicodeToTextConverter( m_convUnicode2Text ); } Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize) { std::unique_ptr<sal_Unicode[]> puTempMem; if( m_seqSource.hasElements() ) { // For surrogates ! // put old rest and new byte sequence into one array // In general when surrogates are used, they should be rarely // cut off between two convert()-calls. So this code is used // rarely and the extra copy is acceptable. puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]); memcpy( puTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() * sizeof( sal_Unicode ) ); memcpy( &(puTempMem[ m_seqSource.getLength() ]) , puSource , nSourceSize*sizeof( sal_Unicode ) ); puSource = puTempMem.get(); nSourceSize += m_seqSource.getLength(); m_seqSource = Sequence< sal_Unicode > (); } sal_Size nTargetCount = 0; sal_Size nSourceCount = 0; sal_uInt32 uiInfo; sal_Size nSrcCvtChars; // take nSourceSize * 3 as preference // this is an upper boundary for converting to utf8, // which most often used as the target. sal_Int32 nSeqSize = nSourceSize * 3; Sequence<sal_Int8> seqText( nSeqSize ); char *pTarget = reinterpret_cast<char *>(seqText.getArray()); while( true ) { nTargetCount += rtl_convertUnicodeToText( m_convUnicode2Text, m_contextUnicode2Text, &( puSource[nSourceCount] ), nSourceSize - nSourceCount , &( pTarget[nTargetCount] ), nSeqSize - nTargetCount, RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT | RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT , &uiInfo, &nSrcCvtChars); nSourceCount += nSrcCvtChars; if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) { nSeqSize = nSeqSize *2; seqText.realloc( nSeqSize ); // double array size pTarget = reinterpret_cast<char *>(seqText.getArray()); continue; } break; } // for surrogates if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) { m_seqSource.realloc( nSourceSize - nSourceCount ); memcpy( m_seqSource.getArray() , &(puSource[nSourceCount]), (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) ); } // reduce the size of the buffer (fast, no copy necessary) seqText.realloc( nTargetCount ); return seqText; } } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

↑ V793 It is odd that the result of the 'nRead + seqStart.getLength()' statement is a part of the condition. Perhaps, this statement should have been compared with something else.