/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* This file incorporates work covered by the following license notice:
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
#pragma once
#include <sal/types.h>
#include <memory>
// Sensible default values for a user interface could be:
// LEVDISDEFAULT_XOTHER 2
// Maximum X replacements to match query, found data may be different by X
// characters from query.
// LEVDISDEFAULT_YSHORTER 1
// Maximum Y insertions to match query, found data may be Y characters
// shorter than query.
// LEVDISDEFAULT_ZLONGER 3
// Maximum Z deletions to match query, found data may be Z characters
// longer than query.
// LEVDISDEFAULT_RELAXED TRUE
// Use relaxed SplitCount instead of mathematical WLD.
//
// Joker/wildcards ('?' and '*') of course do not count as
// replacement/insertion/deletion. At a '?' a replacement is not counted, for a
// '*' the found data may be any number of characters longer than the query.
//
// Strict mathematical WLD: EITHER maximum X replacements OR Y characters
// shorter OR Z characters longer.
// Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR
// Z characters longer. Any combination of actions is valid.
//
// The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed
// integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736.
//
// The corresponding internal default weigh values for these user interface
// values would be:
// LEVDISDEFAULTLIMIT 6
// Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2
// LEVDISDEFAULT_P0 3
// Default nRepP0, weight of replacements.
// LEVDISDEFAULT_Q0 6
// Default nInsQ0, weight of insertions.
// LEVDISDEFAULT_R0 2
// Default nDelR0, weight of deletions.
// The transformation of user input values to weighs is done using CalcLPQR().
// One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ
// characters longer than pattern) then no character can be replaced any more.
// This can be circumvented by increasing nX or/and nZ, but of course with the
// side effect of being less strict then... or the other solution is to use
// relaxed SplitCount (see below), which is the default when using CalcLPQR().
//
// Attention: shorter = WLD.Insert, longer = WLD.Delete
//
// View and counting is always from data string to pattern, a deletion means
// that something is deleted from data to match pattern.
//
// Deletions weigh less in this example because usually less is known than is
// searched for. Replacements get middle weight, for example because of
// misspellings. Insertions are expensive.
//
// Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3
// Allowed are maximum 4 replacements, no insertion, no deletion.
// Matches the user interface values X = 3, Y = 0, Z = 0
//
// bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value
// of WLD() then isn't necessarily the Levenshtein-Distance, but can be
// negative (-WLD) if the WLD is greater than nLimit but single values are
// within the limits.
// For the above default values that could mean: even if the found string is
// already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made
// to reach pattern. Additionally, character swaps count as one replacement.
// Mathematically completely incorrect, but meets user expectations ;-)
//
// Explanation: in the real WLD all actions are withdrawn from a common 100%
// pool, if one gets all there's nothing left for others. With bSplitCount
// replacements have their own pool.
/** "Safe" memory allocation in ctor */
class WLevDisPatternMem
{
std::unique_ptr<sal_Unicode[]> cp;
std::unique_ptr<bool[]> bp;
public:
explicit WLevDisPatternMem( sal_Int32 s )
: cp(new sal_Unicode[s])
, bp(new bool[s])
{
}
sal_Unicode* GetcPtr() const { return cp.get(); }
bool* GetbPtr() const { return bp.get(); }
};
class WLevDisDistanceMem
{
std::unique_ptr<int[]> p;
public:
explicit WLevDisDistanceMem( size_t s )
{
NewMem(s);
}
int* GetPtr() const { return p.get(); }
int* NewMem( size_t s )
{
p.reset(new int[ s<3 ? 3 : s ]);
return p.get();
}
};
/** Weighted Levenshtein Distance (WLD)
For a more detailed explanation see documentation in
i18npool/source/search/levdis.hxx
*/
class WLevDistance
{
sal_Int32 nPatternLen; ///< length of pattern
WLevDisPatternMem aPatMem; ///< manage allocation of pattern array
sal_Unicode* cpPattern; ///< pointer to pattern array
bool* bpPatIsWild; ///< pointer to bool array whether pattern is wildcard
sal_Int32 nArrayLen; ///< length of distance array
WLevDisDistanceMem aDisMem; ///< manage allocation of distance array
int* npDistance; ///< pointer to distance array
int nLimit; ///< WLD limit replacements/insertions/deletions
int nRepP0; ///< replacement weigh
int nInsQ0; ///< insertion weigh
int nDelR0; ///< deletion weigh
int nStars; ///< count of '*' wildcards in pattern
bool bSplitCount; ///< if TRUE, Rep/Ins/Del are counted separately
void InitData( const sal_Unicode* cPattern );
static int Mid3( int x, int y, int z ); ///< middle value of 3 values
public:
/** CTor with user input. Internally calls CalcLPQR().
After this, obtain the resulting limit using GetLimit().
@param bRelaxed the mathematically incorrect method is default (TRUE)
*/
WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY,
int nLongerZ, bool bRelaxed );
WLevDistance( const WLevDistance& rWLD );
~WLevDistance();
/** Calculate the Weighted Levenshtein Distance from string to pattern. */
int WLD( const sal_Unicode* cString, sal_Int32 nStringLen );
/** Calculate the internal weighs corresponding to the user input values.
@returns nLimit for later comparison with WLD()
*/
void CalcLPQR( int nOtherX, int nShorterY, int nLongerZ,
bool bRelaxed );
int GetLimit() const { return nLimit; }
// Calculate current balance, keep this inline for performance reasons!
// c == cpPattern[jj] == cString[ii]
// First seek up to found place, if the balance is still equal there then
// also compare after the found place.
int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen) const
{
int nBalance = 0;
if ( jj != ii )
{
sal_Int32 k;
if ( jj > 0 )
for ( k=0; k < jj; k++ )
if ( cpPattern[k] == c )
nBalance++;
if ( ii > 0 )
for ( k=0; k < ii; k++ )
if ( cString[k] == c )
nBalance--;
if ( !nBalance )
{
for ( k=jj+1; k < nPatternLen; k++ )
if ( cpPattern[k] == c )
nBalance++;
for ( k=ii+1; k < nStringLen; k++ )
if ( cString[k] == c )
nBalance--;
}
}
return nBalance;
}
};
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
↑ V690 The 'WLevDistance' class implements a copy constructor, but lacks the copy assignment operator. It is dangerous to use such a class.