/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* This file incorporates work covered by the following license notice:
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
#include <com/sun/star/uno/Reference.h>
#include <comphelper/sequence.hxx>
#include <comphelper/processfactory.hxx>
#include <cppuhelper/factory.hxx>
#include <cppuhelper/supportsservice.hxx>
#include <cppuhelper/weak.hxx>
#include <com/sun/star/linguistic2/XLinguProperties.hpp>
#include <com/sun/star/linguistic2/LinguServiceManager.hpp>
#include <com/sun/star/linguistic2/XSpellChecker1.hpp>
#include <i18nlangtag/languagetag.hxx>
#include <tools/debug.hxx>
#include <osl/mutex.hxx>
#include <osl/thread.h>
#include <hyphen.h>
#include "hyphenimp.hxx"
#include <linguistic/hyphdta.hxx>
#include <rtl/ustring.hxx>
#include <rtl/ustrbuf.hxx>
#include <rtl/textenc.h>
#include <sal/log.hxx>
#include <linguistic/misc.hxx>
#include <svtools/strings.hrc>
#include <unotools/charclass.hxx>
#include <unotools/lingucfg.hxx>
#include <unotools/resmgr.hxx>
#include <osl/file.hxx>
#include <stdio.h>
#include <string.h>
#include <cassert>
#include <numeric>
#include <vector>
#include <set>
#include <memory>
#include <o3tl/string_view.hxx>
// XML-header to query SPELLML support
// (sent to the spell checker's isValid() to probe whether it understands
// SPELLML queries; used by hyphenate() for compound-word analysis)
constexpr OUStringLiteral SPELLML_SUPPORT = u"<?xml?>";

// Namespace shortcuts used throughout this translation unit.
using namespace osl;
using namespace com::sun::star;
using namespace com::sun::star::beans;
using namespace com::sun::star::lang;
using namespace com::sun::star::uno;
using namespace com::sun::star::linguistic2;
using namespace linguistic;
/// Create the LinguServiceManager for the current process component context.
/// Used to reach the spell checker for SPELLML-based compound analysis.
static uno::Reference< XLinguServiceManager2 > GetLngSvcMgr_Impl()
{
    return LinguServiceManager::create( comphelper::getProcessComponentContext() );
}
// Construct the hyphenator; the listener container is guarded by the global
// linguistic mutex. Dictionaries are discovered lazily in getLocales().
Hyphenator::Hyphenator() :
    aEvtListeners   ( GetLinguMutex() )
{
    bDisposing = false;     // set to true once dispose() has run
}
/// Destructor: releases every lazily loaded libhyphen dictionary handle and
/// unregisters from property change notifications.
Hyphenator::~Hyphenator()
{
    for (size_t i = 0; i < mvDicts.size(); ++i)
    {
        if (mvDicts[i].aPtr)
            hnj_hyphen_free(mvDicts[i].aPtr);
    }

    if (pPropHelper)
        pPropHelper->RemoveAsPropListener();
}
/// Lazily create and return the property helper that tracks the linguistic
/// configuration (min leading/trailing, hyphenate-caps, ...).
PropertyHelper_Hyphenation& Hyphenator::GetPropHelper_Impl()
{
    if (!pPropHelper)
    {
        pPropHelper.reset( new PropertyHelper_Hyphenation(
            static_cast<XHyphenator *>(this), GetLinguProperties() ) );
        // Register as listener only after the helper holds a reference to us.
        pPropHelper->AddAsPropListener();
    }
    return *pPropHelper;
}
/** Return the locales for which a hyphenation dictionary is installed.

    On first call this scans the configuration for new-style (extension)
    dictionaries, merges in legacy dictionary.lst entries, fills mvDicts with
    one (not yet loaded) entry per dictionary/locale pair, and caches the
    resulting locale sequence in aSuppLocales. Subsequent calls return the
    cached sequence.
*/
Sequence< Locale > SAL_CALL Hyphenator::getLocales()
{
    MutexGuard aGuard( GetLinguMutex() );

    // this routine should return the locales supported by the installed
    // dictionaries.
    if (mvDicts.empty())
    {
        SvtLinguConfig aLinguCfg;

        // get list of dictionaries-to-use
        // (or better speaking: the list of dictionaries using the
        // new configuration entries).
        std::vector< SvtLinguConfigDictionaryEntry > aDics;
        uno::Sequence< OUString > aFormatList;
        aLinguCfg.GetSupportedDictionaryFormatsFor( u"Hyphenators"_ustr,
                u"org.openoffice.lingu.LibHnjHyphenator"_ustr, aFormatList );
        for (const auto& rFormat : aFormatList)
        {
            std::vector< SvtLinguConfigDictionaryEntry > aTmpDic(
                aLinguCfg.GetActiveDictionariesByFormat( rFormat ) );
            aDics.insert( aDics.end(), aTmpDic.begin(), aTmpDic.end() );
        }

        //!! for compatibility with old dictionaries (the ones not using extensions
        //!! or new configuration entries, but still using the dictionary.lst file)
        //!! Get the list of old style spell checking dictionaries to use...
        std::vector< SvtLinguConfigDictionaryEntry > aOldStyleDics(
                GetOldStyleDics( "HYPH" ) );

        // to prefer dictionaries with configuration entries we will only
        // use those old style dictionaries that add a language that
        // is not yet supported by the list of new style dictionaries
        MergeNewStyleDicsAndOldStyleDics( aDics, aOldStyleDics );

        if (!aDics.empty())
        {
            // get supported locales from the dictionaries-to-use...
            // (a set is used to de-duplicate locale names across dictionaries)
            std::set<OUString> aLocaleNamesSet;
            for (auto const& dict : aDics)
            {
                for (const auto& rLocaleName : dict.aLocaleNames)
                {
                    aLocaleNamesSet.insert( rLocaleName );
                }
            }
            // ... and add them to the resulting sequence
            std::vector<Locale> aLocalesVec;
            aLocalesVec.reserve(aLocaleNamesSet.size());

            std::transform(aLocaleNamesSet.begin(), aLocaleNamesSet.end(), std::back_inserter(aLocalesVec),
                [](const OUString& localeName) { return LanguageTag::convertToLocale(localeName); });

            aSuppLocales = comphelper::containerToSequence(aLocalesVec);

            //! For each dictionary and each locale we need a separate entry.
            //! If this results in more than one dictionary per locale than (for now)
            //! it is undefined which dictionary gets used.
            //! In the future the implementation should support using several dictionaries
            //! for one locale.
            sal_Int32 numdict = std::accumulate(aDics.begin(), aDics.end(), 0,
                [](const sal_Int32 nSum, const SvtLinguConfigDictionaryEntry& dict) {
                    return nSum + dict.aLocaleNames.getLength(); });

            // add dictionary information
            mvDicts.resize(numdict);

            sal_Int32 k = 0;
            for (auto const& dict : aDics)
            {
                if (dict.aLocaleNames.hasElements() &&
                    dict.aLocations.hasElements())
                {
                    // currently only one language per dictionary is supported in the actual implementation...
                    // Thus here we work-around this by adding the same dictionary several times.
                    // Once for each of its supported locales.
                    for (const auto& rLocaleName : dict.aLocaleNames)
                    {
                        LanguageTag aLanguageTag(rLocaleName);
                        mvDicts[k].aPtr = nullptr;  // loaded lazily by LoadDictionary()
                        mvDicts[k].eEnc = RTL_TEXTENCODING_DONTKNOW;
                        mvDicts[k].aLoc = aLanguageTag.getLocale();
                        mvDicts[k].apCC.reset( new CharClass( std::move(aLanguageTag) ) );
                        // also both files have to be in the same directory and the
                        // file names must only differ in the extension (.aff/.dic).
                        // Thus we use the first location only and strip the extension part.
                        OUString aLocation = dict.aLocations[0];
                        sal_Int32 nPos = aLocation.lastIndexOf( '.' );
                        aLocation = aLocation.copy( 0, nPos );
                        mvDicts[k].aName = aLocation;

                        ++k;
                    }
                }
            }
            DBG_ASSERT( k == numdict, "index mismatch?" );
        }
        else
        {
            // no dictionary found so register no dictionaries
            mvDicts.clear();
            aSuppLocales.realloc(0);
        }
    }

    return aSuppLocales;
}
/// @return true iff a hyphenation dictionary is installed for rLocale.
sal_Bool SAL_CALL Hyphenator::hasLocale(const Locale& rLocale)
{
    MutexGuard aGuard( GetLinguMutex() );

    // Populate the cached locale list on demand.
    if (!aSuppLocales.hasElements())
        getLocales();

    const bool bFound = comphelper::findValue(aSuppLocales, rLocale) != -1;
    return bFound;
}
namespace {
/// Load the libhyphen pattern file belonging to rDict (path "<aName>.dic").
/// On success stores the dictionary handle and its text encoding in rDict.
/// @return false (leaving rDict unmodified) if the file cannot be loaded.
bool LoadDictionary(HDInfo& rDict)
{
    OUString aSysPath;
    osl::FileBase::getSystemPathFromFileURL(rDict.aName + ".dic", aSysPath);

#if defined(_WIN32)
    // hnj_hyphen_load expects UTF-8 encoded paths with \\?\ long path prefix.
    OString aPath8 = Win_AddLongPathPrefix(OUStringToOString(aSysPath, RTL_TEXTENCODING_UTF8));
#else
    OString aPath8(OU2ENC(aSysPath, osl_getThreadTextEncoding()));
#endif

    HyphenDict* pDict = hnj_hyphen_load(aPath8.getStr());
    if (!pDict)
    {
        SAL_WARN(
            "lingucomponent",
            "Couldn't find file " << aSysPath);
        return false;
    }

    rDict.aPtr = pDict;
    // The .dic file declares its own character set; remember it for
    // converting words before/after calling into libhyphen.
    rDict.eEnc = getTextEncodingFromCharset(pDict->cset);
    return true;
}
}
/** Find the best hyphenation break point of aWord at or before nMaxLeading.

    @param aWord        word to hyphenate (original casing; may contain smart quotes)
    @param aLocale      locale selecting the hyphenation dictionary
    @param nMaxLeading  maximum number of characters allowed before the break
    @param aProperties  temporary linguistic property overrides
    @return a hyphenated-word object describing the chosen break (possibly an
            alternative spelling for discretionary hyphenation), or an empty
            reference when no dictionary matches the locale, the word is
            excluded (e.g. all-caps with NoHyphenateCaps), or no break point
            satisfies the min-leading/trailing/length constraints.
*/
Reference< XHyphenatedWord > SAL_CALL Hyphenator::hyphenate( const OUString& aWord,
        const css::lang::Locale& aLocale,
        sal_Int16 nMaxLeading,
        const css::uno::Sequence< css::beans::PropertyValue >& aProperties )
{
    PropertyHelper_Hyphenation& rHelper = GetPropHelper();
    rHelper.SetTmpPropVals(aProperties);
    sal_Int16 minTrail = rHelper.GetMinTrailing();
    sal_Int16 minLead = rHelper.GetMinLeading();
    sal_Int16 minCompoundLead = rHelper.GetCompoundMinLeading();
    sal_Int16 minLen = rHelper.GetMinWordLength();
    bool bNoHyphenateCaps = rHelper.IsNoHyphenateCaps();

    rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;

    Reference< XHyphenatedWord > xRes;

    // Find the dictionary entry registered for this locale
    // (if several entries match, the last one wins).
    int k = -1;
    for (size_t j = 0; j < mvDicts.size(); ++j)
    {
        if (aLocale == mvDicts[j].aLoc)
            k = j;
    }

    // if we have a hyphenation dictionary matching this locale
    if (k != -1)
    {
        int nHyphenationPos = -1;
        int nHyphenationPosAlt = -1;
        int nHyphenationPosAltHyph = -1;

        // if this dictionary has not been loaded yet do that
        if (!mvDicts[k].aPtr)
        {
            if (!LoadDictionary(mvDicts[k]))
                return nullptr;
        }

        // otherwise hyphenate the word with that dictionary
        HyphenDict *dict = mvDicts[k].aPtr;
        eEnc = mvDicts[k].eEnc;
        CharClass * pCC = mvDicts[k].apCC.get();

        // Don't hyphenate uppercase words if requested
        if (bNoHyphenateCaps && aWord == makeUpperCase(aWord, pCC))
        {
            return nullptr;
        }

        // we don't want to work with a default text encoding since following incorrect
        // results may occur only for specific text and thus may be hard to notice.
        // Thus better always make a clean exit here if the text encoding is in question.
        // Hopefully something not working at all will raise proper attention quickly. ;-)
        DBG_ASSERT( eEnc != RTL_TEXTENCODING_DONTKNOW, "failed to get text encoding! (maybe incorrect encoding string in file)" );
        if (eEnc == RTL_TEXTENCODING_DONTKNOW)
            return nullptr;

        // remember the original capitalization so replacement text can be re-cased
        CapType ct = capitalType(aWord, pCC);

        // first convert any smart quotes or apostrophes to normal ones
        OUStringBuffer rBuf(aWord);
        sal_Int32 nc = rBuf.getLength();
        sal_Unicode ch;
        for (sal_Int32 ix=0; ix < nc; ix++)
        {
            ch = rBuf[ix];
            if ((ch == 0x201C) || (ch == 0x201D))
                rBuf[ix] = u'"';
            if ((ch == 0x2018) || (ch == 0x2019))
                rBuf[ix] = u'\'';
        }
        OUString nWord(rBuf.makeStringAndClear());

        // now convert word to all lowercase for pattern recognition
        OUString nTerm(makeLowerCase(nWord, pCC));

        // now convert word to needed encoding
        OString encWord(OU2ENC(nTerm,eEnc));

        int wordlen = encWord.getLength();
        std::unique_ptr<char[]> lcword(new char[wordlen + 1]);
        // NOTE: hyphens needs extra room; libhyphen documents that the output
        // buffer must be larger than the word — wordlen+5 matches its examples.
        std::unique_ptr<char[]> hyphens(new char[wordlen + 5]);
        char ** rep = nullptr; // replacements of discretionary hyphenation
        int * pos = nullptr; // array of [hyphenation point] minus [deletion position]
        int * cut = nullptr; // length of deletions in original word

        // copy converted word into simple char buffer
        strcpy(lcword.get(),encWord.getStr());

        // now strip off any ending periods
        int n = wordlen-1;
        while((n >=0) && (lcword[n] == '.'))
            n--;
        n++;
        if (n > 0)
        {
            // rep/pos/cut are malloc'ed by libhyphen on demand; they must be
            // freed with free() on every exit path below.
            const bool bFailed = 0 != hnj_hyphen_hyphenate3( dict, lcword.get(), n, hyphens.get(), nullptr,
                    &rep, &pos, &cut, minLead, minTrail,
                    std::max<sal_Int16>(dict->clhmin, std::max<sal_Int16>(dict->clhmin, 2) + std::max(0, minLead - std::max<sal_Int16>(dict->lhmin, 2))),
                    std::max<sal_Int16>(dict->crhmin, std::max<sal_Int16>(dict->crhmin, 2) + std::max(0, minTrail - std::max<sal_Int16>(dict->rhmin, 2))) );
            if (bFailed)
            {
                // whoops something did not work
                if (rep)
                {
                    for(int j = 0; j < n; j++)
                    {
                        if (rep[j]) free(rep[j]);
                    }
                    free(rep);
                }
                if (pos) free(pos);
                if (cut) free(cut);
                return nullptr;
            }
        }

        // now backfill hyphens[] for any removed trailing periods
        for (int c = n; c < wordlen; c++) hyphens[c] = '0';
        hyphens[wordlen] = '\0';

        // upper bound (exclusive) for the break position, in check-word units
        sal_Int32 Leading =  GetPosInWordToCheck( aWord, nMaxLeading );

        // use morphological analysis of Hunspell to get better hyphenation of compound words
        // optionally when hyphenation zone is enabled
        // pa: fields contain stems resulted by compound word analysis of non-dictionary words
        // hy: fields contain hyphenation data of dictionary (compound) words
        Reference< XSpellAlternatives > xTmpRes;
        bool bAnalyzed = false; // enough the analyse once the word
        bool bCompoundHyphenation = true; // try to hyphenate compound words better
        OUString sStems; // processed result of the compound word analysis, e.g. com|pound|word
        sal_Int32 nSuffixLen = 0; // do not remove break points in suffixes

        // scan all candidate break points; the LAST accepted one wins,
        // i.e. the break closest to (but below) Leading.
        for (sal_Int32 i = 0; i < n; i++)
        {
            int leftrep = 0;
            bool hit = (n >= minLen);
            if (!rep || !rep[i])
            {
                // plain (non-discretionary) break point at position i
                hit = hit && (hyphens[i]&1) && (i < Leading);
                hit = hit && (i >= (minLead-1) );
                hit = hit && ((n - i - 1) >= minTrail);
            }
            else
            {
                // calculate change character length before hyphenation point signed with '='
                // (for UTF-8, count only lead bytes, i.e. bytes whose top bits are not 10)
                for (char * c = rep[i]; *c && (*c != '='); c++)
                {
                    if (eEnc == RTL_TEXTENCODING_UTF8)
                    {
                        if (static_cast<unsigned char>(*c) >> 6 != 2)
                            leftrep++;
                    }
                    else
                        leftrep++;
                }
                hit = hit && (hyphens[i]&1) && ((i + leftrep - pos[i]) < Leading);
                hit = hit && ((i + leftrep - pos[i]) >= (minLead-1) );
                hit = hit && ((n - i - 1 + sal::static_int_cast< sal_sSize >(strlen(rep[i])) - leftrep - 1) >= minTrail);
            }
            if (hit)
            {
                // skip hyphenation right after stem boundaries in compound words
                // if minCompoundLead > 2 (default value: less than n=minCompoundLead character distance)
                if ( bCompoundHyphenation && minCompoundLead > 2 && nHyphenationPos > -1 && i - nHyphenationPos < minCompoundLead )
                {
                    uno::Reference< XLinguServiceManager2 > xLngSvcMgr( GetLngSvcMgr_Impl() );
                    uno::Reference< XSpellChecker1 > xSpell;
                    LanguageType nLanguage = LinguLocaleToLanguage( aLocale );

                    xSpell.set( xLngSvcMgr->getSpellChecker(), UNO_QUERY );

                    // get morphological analysis of the word
                    // (only if the spell checker advertises SPELLML support)
                    if ( ( bAnalyzed && xTmpRes.is() ) || ( xSpell.is() && xSpell->isValid(
                        SPELLML_SUPPORT, static_cast<sal_uInt16>(nLanguage),
                        uno::Sequence< beans::PropertyValue >() ) ) )
                    {
                        if ( !bAnalyzed )
                        {
                            xTmpRes = xSpell->spell( "<?xml?><query type='analyze'><word>" +
                                aWord + "</word></query>",
                                static_cast<sal_uInt16>(nLanguage),
                                uno::Sequence< beans::PropertyValue >() );
                            bAnalyzed = true;
                            if (xTmpRes.is())
                            {
                                Sequence<OUString>seq = xTmpRes->getAlternatives();
                                if (seq.hasElements())
                                {
                                    sal_Int32 nEndOfFirstAnalysis = seq[0].indexOf("</a>");
                                    // FIXME use only the first analysis
                                    OUString morph(
                                        seq[0].copy(0, nEndOfFirstAnalysis));
                                    // concatenate pa: fields, i.e. stems in the analysis:
                                    // pa:stem1 pa:stem2 pa:stem3 -> stem1||stem2||stem3
                                    sal_Int32 nPa = -1;
                                    while ( (nPa = morph.indexOf(u" pa:", nPa + 1)) > -1 )
                                    {
                                        // use hy: field of the actual stem, if it exists
                                        // pa:stem1 hy:st|em1 pa:stem2 -> st|em1||stem2
                                        sal_Int32 nHy = morph.indexOf(u" hy:", nPa + 3);
                                        sal_Int32 nPa2 = morph.indexOf(u" pa:", nPa + 3);
                                        if ( nHy > -1 && ( nPa2 == -1 || nHy < nPa2 ) )
                                        {
                                            OUString sStems2(morph.getToken(1, ' ', nHy).copy(3));
                                            if ( sStems2.indexOf('|') > -1 )
                                                sStems += sStems2+ u"||";
                                            else if ( sal_Int32 nBreak = o3tl::toInt32(sStems2) )
                                            {
                                                // numeric hy: value = break offset within the stem
                                                OUString sPa(morph.getToken(1, ' ', nPa).copy(3));
                                                if ( nBreak < sPa.getLength() )
                                                    sStems += OUString::Concat(sPa.subView(0, nBreak)) + u"|" +
                                                        sPa.subView(nBreak);
                                            }
                                        }
                                        else
                                        {
                                            OUString sPa(morph.getToken(1, ' ', nPa).copy(3));

                                            // handle special case: missing pa: in morphological analysis
                                            // before in-word suffixes (German, Sweden etc. dictionaries)
                                            // (recognized by the single last pa:)
                                            if (sStems.isEmpty() && nPa2 == -1 && aWord.endsWith(sPa))
                                            {
                                                sStems = OUString::Concat(aWord.subView(0, aWord.getLength() -
                                                            sPa.getLength())) + u"||" +
                                                            aWord.subView(aWord.getLength() -
                                                            sPa.getLength());
                                                break;
                                            }

                                            sStems += sPa + "||";

                                            // count suffix length
                                            sal_Int32 nSt = morph.lastIndexOf(" st:");
                                            if ( nSt > -1 )
                                            {
                                                sal_Int32 nStemLen =
                                                    o3tl::getToken(morph, 1, ' ', nSt).length() - 3;
                                                if ( nStemLen < sPa.getLength() )
                                                    nSuffixLen = sPa.getLength() - nStemLen;
                                            }
                                        }
                                        if ( nPa == -1 ) // getToken() can modify nPa
                                            break;
                                    }
                                    // only hy:, but not pa:
                                    if ( sStems.isEmpty() )
                                    {
                                        // check hy: (pre-defined hyphenation)
                                        sal_Int32 nHy = morph.indexOf(" hy:");
                                        if (nHy > -1)
                                        {
                                            sStems = morph.getToken(1, ' ', nHy).copy(3);
                                            if ( sStems.indexOf('|') == -1 && sStems.indexOf('-') == -1 )
                                            {
                                                if ( sal_Int32 nBreak = o3tl::toInt32(sStems) )
                                                {
                                                    if ( nBreak < aWord.getLength() )
                                                        sStems += OUString::Concat(aWord.subView(0, nBreak)) + u"|" +
                                                            aWord.subView(nBreak);
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }

                        // handle string separated by |, e.g "program hy:pro|gram"
                        if ( sStems.indexOf('|') > -1 )
                        {
                            sal_Int32 nLetters = 0; // count not separator characters
                            sal_Int32 nSepPos = -1; // position of last character | used for stem boundaries
                            bool bWeightedSep = false; // double separator || = weighted stem boundary
                            sal_Int32 j = 0;
                            // walk sStems until we have passed i non-separator letters
                            for (; j < sStems.getLength() && nLetters <= i; j++)
                            {
                                if ( sStems[j] == '|' )
                                {
                                    bWeightedSep = nSepPos > -1 && (j - 1 == nSepPos);
                                    nSepPos = j;
                                }
                                else if ( sStems[j] != '-' && sStems[j] != '=' && sStems[j] != '*' )
                                    ++nLetters;
                            }
                            // skip break points near stem boundaries
                            if (
                                 // there is a stem boundary before the actual break point
                                 nSepPos > -1 &&
                                 // and the break point is within a stem, i.e. not in the
                                 // suffix of the last stem
                                 i < aWord.getLength() - nSuffixLen - 1 &&
                                 // and it is not another stem boundary
                                 j + 1 < sStems.getLength() &&
                                 ( sStems[j + 1] != u'|' ||
                                     // except if it's only the previous was a weighted one
                                     ( bWeightedSep && ( j + 2 == sStems.getLength() ||
                                           sStems[j + 2] != u'|' ) ) ) )
                            {
                                continue;
                            }
                        }
                        else
                            // not a compound word
                            bCompoundHyphenation = false;
                    }
                    else
                        // no SPELLML support, no morphological analysis
                        bCompoundHyphenation = false;
                }

                nHyphenationPos = i;
                if (rep && rep[i])
                {
                    // remember alternative-spelling positions for this break
                    nHyphenationPosAlt = i - pos[i];
                    nHyphenationPosAltHyph = i + leftrep - pos[i];
                }
            }
        }

        if (nHyphenationPos  == -1)
        {
            xRes = nullptr;
        }
        else
        {
            // NOTE(review): rep[nHyphenationPos] != nullptr implies the
            // hnj_hyphen_hyphenate3() call ran and (presumably) allocated
            // pos/cut alongside rep, so the cut[...] access below should be
            // safe — flagged by PVS V595; confirm against libhyphen docs.
            if (rep && rep[nHyphenationPos])
            {
                // remove equal sign
                char * s = rep[nHyphenationPos];
                int eq = 0;
                for (; *s; s++)
                {
                    if (*s == '=') eq = 1;
                    if (eq) *s = *(s + 1);
                }
                OUString repHyphlow(rep[nHyphenationPos], strlen(rep[nHyphenationPos]), eEnc);
                OUString repHyph;
                // re-apply the original capitalization to the replacement text
                switch (ct)
                {
                    case CapType::ALLCAP:
                    {
                        repHyph = makeUpperCase(repHyphlow, pCC);
                        break;
                    }
                    case CapType::INITCAP:
                    {
                        if (nHyphenationPosAlt == -1)
                            repHyph = makeInitCap(repHyphlow, pCC);
                        else
                             repHyph = repHyphlow;
                        break;
                    }
                    default:
                    {
                        repHyph = repHyphlow;
                        break;
                    }
                }

                // handle shortening
                sal_Int16 nPos = static_cast<sal_Int16>((nHyphenationPosAltHyph < nHyphenationPos) ?
                        nHyphenationPosAltHyph : nHyphenationPos);
                // discretionary hyphenation
                xRes = HyphenatedWord::CreateHyphenatedWord( aWord, LinguLocaleToLanguage( aLocale ), nPos,
                        aWord.replaceAt(nHyphenationPosAlt + 1, cut[nHyphenationPos], repHyph),
                        static_cast<sal_Int16>(nHyphenationPosAltHyph));
            }
            else
            {
                xRes = HyphenatedWord::CreateHyphenatedWord( aWord, LinguLocaleToLanguage( aLocale ),
                        static_cast<sal_Int16>(nHyphenationPos), aWord, static_cast<sal_Int16>(nHyphenationPos));
            }
        }

        // release libhyphen-allocated replacement data
        if (rep)
        {
            for(int j = 0; j < n; j++)
            {
                if (rep[j]) free(rep[j]);
            }
            free(rep);
        }
        if (pos) free(pos);
        if (cut) free(cut);

        return xRes;
    }
    return nullptr;
}
/** Query whether breaking at nIndex requires an alternative spelling
    (discretionary hyphenation, e.g. German "Schiffahrt" -> "Schiff-fahrt").

    @return the hyphenated word if an alternative spelling breaks exactly at
            nIndex, otherwise an empty reference.
*/
Reference < XHyphenatedWord > SAL_CALL Hyphenator::queryAlternativeSpelling(
        const OUString& aWord,
        const css::lang::Locale& aLocale,
        sal_Int16 nIndex,
        const css::uno::Sequence< css::beans::PropertyValue >& aProperties )
{
    // Firstly we allow only one plus character before the hyphen to avoid to miss the right break point:
    for (int nExtraChar = 1; nExtraChar <= 2; ++nExtraChar)
    {
        Reference< XHyphenatedWord > xHyphWord =
            hyphenate(aWord, aLocale, nIndex + 1 + nExtraChar, aProperties);

        const bool bMatchesIndex = xHyphWord.is()
            && xHyphWord->isAlternativeSpelling()
            && xHyphWord->getHyphenationPos() == nIndex;
        if (bMatchesIndex)
            return xHyphWord;
    }
    return nullptr;
}
/** Compute all admissible hyphenation points of aWord at once.

    @param aWord        word to analyze
    @param aLocale      locale selecting the hyphenation dictionary
    @param aProperties  temporary linguistic property overrides
    @return an XPossibleHyphens carrying the word with '=' inserted at each
            break point plus the break-position sequence; an empty reference
            on missing dictionary/encoding; a no-breaks result for words
            shorter than MinWordLength.
*/
Reference< XPossibleHyphens > SAL_CALL Hyphenator::createPossibleHyphens( const OUString& aWord,
        const css::lang::Locale& aLocale,
        const css::uno::Sequence< css::beans::PropertyValue >& aProperties )
{
    PropertyHelper_Hyphenation& rHelper = GetPropHelper();
    rHelper.SetTmpPropVals(aProperties);
    sal_Int16 minTrail = rHelper.GetMinTrailing();
    sal_Int16 minLead = rHelper.GetMinLeading();
    sal_Int16 minLen = rHelper.GetMinWordLength();

    // Resolves: fdo#41083 honour MinWordLength in "createPossibleHyphens" as
    // well as "hyphenate"
    if (aWord.getLength() < minLen)
    {
        return PossibleHyphens::CreatePossibleHyphens( aWord, LinguLocaleToLanguage( aLocale ),
                      aWord, Sequence< sal_Int16 >() );
    }

    // Find the dictionary entry registered for this locale
    // (if several entries match, the last one wins).
    int k = -1;
    for (size_t j = 0; j < mvDicts.size(); ++j)
    {
        if (aLocale == mvDicts[j].aLoc)
            k = j;
    }

    // if we have a hyphenation dictionary matching this locale
    if (k != -1)
    {
        HyphenDict *dict = nullptr;
        // if this dictionary has not been loaded yet do that
        if (!mvDicts[k].aPtr)
        {
            if (!LoadDictionary(mvDicts[k]))
                return nullptr;
        }

        // otherwise hyphenate the word with that dictionary
        dict = mvDicts[k].aPtr;
        rtl_TextEncoding eEnc = mvDicts[k].eEnc;
        CharClass* pCC = mvDicts[k].apCC.get();

        // we don't want to work with a default text encoding since following incorrect
        // results may occur only for specific text and thus may be hard to notice.
        // Thus better always make a clean exit here if the text encoding is in question.
        // Hopefully something not working at all will raise proper attention quickly. ;-)
        DBG_ASSERT( eEnc != RTL_TEXTENCODING_DONTKNOW, "failed to get text encoding! (maybe incorrect encoding string in file)" );
        if (eEnc == RTL_TEXTENCODING_DONTKNOW)
            return nullptr;

        // first handle smart quotes both single and double
        OUStringBuffer rBuf(aWord);
        sal_Int32 nc = rBuf.getLength();
        sal_Unicode ch;
        for (sal_Int32 ix=0; ix < nc; ix++)
        {
            ch = rBuf[ix];
            if ((ch == 0x201C) || (ch == 0x201D))
                rBuf[ix] = u'"';
            if ((ch == 0x2018) || (ch == 0x2019))
                rBuf[ix] = u'\'';
        }
        OUString nWord(rBuf.makeStringAndClear());

        // now convert word to all lowercase for pattern recognition
        OUString nTerm(makeLowerCase(nWord, pCC));

        // now convert word to needed encoding
        OString encWord(OU2ENC(nTerm,eEnc));

        sal_Int32 wordlen = encWord.getLength();
        std::unique_ptr<char[]> lcword(new char[wordlen+1]);
        std::unique_ptr<char[]> hyphens(new char[wordlen+5]);
        char ** rep = nullptr; // replacements of discretionary hyphenation
        int * pos = nullptr; // array of [hyphenation point] minus [deletion position]
        int * cut = nullptr; // length of deletions in original word

        // copy converted word into simple char buffer
        strcpy(lcword.get(),encWord.getStr());

        // first remove any trailing periods
        sal_Int32 n = wordlen-1;
        while((n >=0) && (lcword[n] == '.'))
            n--;
        n++;
        if (n > 0)
        {
            // rep/pos/cut are malloc'ed by libhyphen on demand; they must be
            // freed with free() on every exit path below.
            const bool bFailed = 0 != hnj_hyphen_hyphenate3(dict, lcword.get(), n, hyphens.get(), nullptr,
                    &rep, &pos, &cut, minLead, minTrail,
                    std::max<sal_Int16>(dict->clhmin, std::max<sal_Int16>(dict->clhmin, 2) + std::max(0, minLead - std::max<sal_Int16>(dict->lhmin, 2))),
                    std::max<sal_Int16>(dict->crhmin, std::max<sal_Int16>(dict->crhmin, 2) + std::max(0, minTrail - std::max<sal_Int16>(dict->rhmin, 2))) );
            if (bFailed)
            {
                if (rep)
                {
                    for(int j = 0; j < n; j++)
                    {
                        if (rep[j]) free(rep[j]);
                    }
                    free(rep);
                }
                if (pos) free(pos);
                if (cut) free(cut);

                return nullptr;
            }
        }
        // now backfill hyphens[] for any removed periods
        for (sal_Int32 c = n; c < wordlen; c++)
            hyphens[c] = '0';
        hyphens[wordlen] = '\0';

        // count break points (odd hyphens[] value marks a valid break)
        sal_Int32 nHyphCount = 0;

        for ( sal_Int32 i = 0; i < encWord.getLength(); i++)
        {
            if (hyphens[i]&1)
                nHyphCount++;
        }

        Sequence< sal_Int16 > aHyphPos(nHyphCount);
        sal_Int16 *pPos = aHyphPos.getArray();
        OUStringBuffer hyphenatedWordBuffer;
        nHyphCount = 0;

        // build the "hy=phen=at=ed" display form and the position sequence
        for (sal_Int32 i = 0; i < nWord.getLength(); i++)
        {
            hyphenatedWordBuffer.append(aWord[i]);
            // hyphenation position
            if (hyphens[i]&1)
            {
                // linguistic::PossibleHyphens is stuck with
                // css::uno::Sequence<sal_Int16> because of
                // css.linguistic2.XPossibleHyphens.getHyphenationPositions, so
                // any further positions need to be ignored:
                assert(i >= SAL_MIN_INT16);
                if (i > SAL_MAX_INT16)
                {
                    SAL_WARN(
                        "lingucomponent",
                        "hyphen pos " << i << " > SAL_MAX_INT16 in \"" << aWord
                            << "\"");
                    continue;
                }
                pPos[nHyphCount] = i;
                hyphenatedWordBuffer.append('=');
                nHyphCount++;
            }
        }

        OUString hyphenatedWord = hyphenatedWordBuffer.makeStringAndClear();

        Reference< XPossibleHyphens > xRes = PossibleHyphens::CreatePossibleHyphens(
            aWord, LinguLocaleToLanguage( aLocale ), hyphenatedWord, aHyphPos);

        // release libhyphen-allocated replacement data
        if (rep)
        {
            for(int j = 0; j < n; j++)
            {
                if (rep[j]) free(rep[j]);
            }
            free(rep);
        }
        if (pos) free(pos);
        if (cut) free(cut);

        return xRes;
    }

    return nullptr;
}
/// Lowercase aTerm via the locale-aware character class;
/// returns aTerm unchanged when no character class is available.
OUString Hyphenator::makeLowerCase(const OUString& aTerm, CharClass const * pCC)
{
    return pCC ? pCC->lowercase(aTerm) : aTerm;
}
/// Uppercase aTerm via the locale-aware character class;
/// returns aTerm unchanged when no character class is available.
OUString Hyphenator::makeUpperCase(const OUString& aTerm, CharClass const * pCC)
{
    return pCC ? pCC->uppercase(aTerm) : aTerm;
}
/// Capitalize the first character of aTerm and lowercase the rest, using the
/// locale-aware character class; returns aTerm unchanged when pCC is null or
/// the term is empty.
OUString Hyphenator::makeInitCap(const OUString& aTerm, CharClass const * pCC)
{
    const sal_Int32 nLen = aTerm.getLength();
    if (!pCC || nLen == 0)
        return aTerm;

    OUString aFirst = aTerm.copy(0, 1);
    if (nLen == 1)
        return pCC->uppercase(aFirst, 0, 1);
    return pCC->uppercase(aFirst, 0, 1) + pCC->lowercase(aTerm, 1, nLen - 1);
}
/// Register a linguistic-service event listener.
/// @return true if the listener was added; false while disposing or for null.
sal_Bool SAL_CALL Hyphenator::addLinguServiceEventListener(
        const Reference< XLinguServiceEventListener >& rxLstnr )
{
    MutexGuard aGuard( GetLinguMutex() );

    if (bDisposing || !rxLstnr.is())
        return false;
    return GetPropHelper().addLinguServiceEventListener( rxLstnr );
}
/// Unregister a linguistic-service event listener.
/// @return true if the listener was removed; false while disposing or for null.
sal_Bool SAL_CALL Hyphenator::removeLinguServiceEventListener(
        const Reference< XLinguServiceEventListener >& rxLstnr )
{
    MutexGuard aGuard( GetLinguMutex() );

    if (bDisposing || !rxLstnr.is())
        return false;
    return GetPropHelper().removeLinguServiceEventListener( rxLstnr );
}
/// Return the localized display name of this service for rLocale,
/// taken from the svtools resource strings.
OUString SAL_CALL Hyphenator::getServiceDisplayName(const Locale& rLocale)
{
    const std::locale aResLocale(Translate::Create("svt", LanguageTag(rLocale)));
    return Translate::get(STR_DESCRIPTION_LIBHYPHEN, aResLocale);
}
/// XInitialization: expects exactly two arguments, the first being the
/// XLinguProperties property set. A second call is a no-op.
void SAL_CALL Hyphenator::initialize( const Sequence< Any >& rArguments )
{
    MutexGuard aGuard( GetLinguMutex() );

    if (pPropHelper)    // already initialized
        return;

    if (rArguments.getLength() != 2)
    {
        OSL_FAIL( "wrong number of arguments in sequence" );
        return;
    }

    Reference< XLinguProperties > xPropSet;
    rArguments.getConstArray()[0] >>= xPropSet;
    // rArguments.getConstArray()[1] >>= xDicList;

    //! Pointer allows for access of the non-UNO functions.
    //! And the reference to the UNO-functions while increasing
    //! the ref-count and will implicitly free the memory
    //! when the object is no longer used.
    pPropHelper.reset( new PropertyHelper_Hyphenation( static_cast<XHyphenator *>(this), xPropSet ) );
    pPropHelper->AddAsPropListener();   //! after a reference is established
}
/// XComponent: notify listeners, drop them, and tear down the property
/// helper. Safe to call more than once; only the first call does work.
void SAL_CALL Hyphenator::dispose()
{
    MutexGuard aGuard( GetLinguMutex() );

    if (bDisposing)
        return;
    bDisposing = true;

    // Tell all registered listeners we are going away, then clear them.
    EventObject aEvtObj( static_cast<XHyphenator *>(this) );
    aEvtListeners.disposeAndClear( aEvtObj );

    // Unregister from property notifications and release the helper.
    if (pPropHelper)
    {
        pPropHelper->RemoveAsPropListener();
        pPropHelper.reset();
    }
}
/// XComponent: register a disposal listener (ignored while disposing or null).
void SAL_CALL Hyphenator::addEventListener( const Reference< XEventListener >& rxListener )
{
    MutexGuard aGuard( GetLinguMutex() );

    const bool bAccept = !bDisposing && rxListener.is();
    if (bAccept)
        aEvtListeners.addInterface( rxListener );
}
/// XComponent: unregister a disposal listener (ignored while disposing or null).
void SAL_CALL Hyphenator::removeEventListener( const Reference< XEventListener >& rxListener )
{
    MutexGuard aGuard( GetLinguMutex() );

    const bool bAccept = !bDisposing && rxListener.is();
    if (bAccept)
        aEvtListeners.removeInterface( rxListener );
}
// Service specific part
/// XServiceInfo: unique implementation name of this component.
OUString SAL_CALL Hyphenator::getImplementationName()
{
    return u"org.openoffice.lingu.LibHnjHyphenator"_ustr;
}
/// XServiceInfo: true iff ServiceName is in getSupportedServiceNames().
sal_Bool SAL_CALL Hyphenator::supportsService( const OUString& ServiceName )
{
    return cppu::supportsService(this, ServiceName);
}
/// XServiceInfo: the single hyphenator service name this component provides.
Sequence< OUString > SAL_CALL Hyphenator::getSupportedServiceNames()
{
    return { SN_HYPHENATOR };
}
/// C factory entry point used by the UNO constructor-based component
/// registration; creates a new, acquired Hyphenator instance.
extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
lingucomponent_Hyphenator_get_implementation(
    css::uno::XComponentContext* , css::uno::Sequence<css::uno::Any> const&)
{
    return cppu::acquire(new Hyphenator());
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
/* NOTE(review): three lines of raw PVS-Studio analyzer output ("V530 ...
   'append' ...", "V595 ... 'cut' pointer ...") had been pasted after the vim
   modeline; converted to this comment because bare report text is not valid
   C++ and breaks compilation. Re V595: 'cut' is only dereferenced once a
   break point was found, which implies hnj_hyphen_hyphenate3() ran and
   allocated it, so the flagged access appears safe. */