Source code for abydos.distance._eudex

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._eudex.

eudex distance functions
"""

from typing import (
    Any,
    Callable,
    Generator,
    Iterable,
    List,
    Optional,
    Union,
    cast,
)

from ._distance import _Distance
from ..phonetic import Eudex as EudexPhonetic

__all__ = ['Eudex']


[docs]class Eudex(_Distance): """Distance between the Eudex hashes of two terms. Cf. :cite:`Ticki:2016`. .. versionadded:: 0.3.6 """
[docs] @staticmethod def gen_fibonacci() -> Generator[float, None, None]: """Yield the next Fibonacci number. Based on https://www.python-course.eu/generators.php Starts at Fibonacci number 3 (the second 1) Yields ------ int The next Fibonacci number .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ num_a, num_b = 1, 2 while True: yield num_a num_a, num_b = num_b, num_a + num_b
[docs] @staticmethod def gen_exponential(base: int = 2) -> Generator[float, None, None]: """Yield the next value in an exponential series of the base. Starts at base**0 Parameters ---------- base : int The base to exponentiate Yields ------ int The next power of `base` .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ exp = 0 while True: yield base ** exp exp += 1
def __init__( self, weights: Optional[ Union[ str, Iterable[float], Callable[[], Generator[float, None, None]], ] ] = 'exponential', max_length: int = 8, **kwargs: Any ) -> None: """Initialize Eudex instance. Parameters ---------- weights : str, iterable, or generator function The weights or weights generator function - If set to ``None``, a simple Hamming distance is calculated. - If set to ``exponential``, weight decays by powers of 2, as proposed in the eudex specification: https://github.com/ticki/eudex. - If set to ``fibonacci``, weight decays through the Fibonacci series, as in the eudex reference implementation. - If set to a callable function, this assumes it creates a generator and the generator is used to populate a series of weights. - If set to an iterable, the iterable's values should be integers and will be used as the weights. In all cases, the weights should be ordered or generated from least significant to most significant, so larger values should generally come first. max_length : int The number of characters to encode as a eudex hash **kwargs Arbitrary keyword arguments .. versionadded:: 0.4.0 """ super(Eudex, self).__init__(**kwargs) self._weights = weights self._max_length = max_length self._phonetic_alg = EudexPhonetic(max_length=max_length)
[docs] def dist_abs(self, src: str, tar: str, normalized: bool = False) -> float: """Calculate the distance between the Eudex hashes of two terms. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison normalized : bool Normalizes to [0, 1] if True Returns ------- int The Eudex Hamming distance Examples -------- >>> cmp = Eudex() >>> cmp.dist_abs('cat', 'hat') 128 >>> cmp.dist_abs('Niall', 'Neil') 2 >>> cmp.dist_abs('Colin', 'Cuilen') 10 >>> cmp.dist_abs('ATCG', 'TAGC') 403 >>> cmp = Eudex(weights='fibonacci') >>> cmp.dist_abs('cat', 'hat') 34 >>> cmp.dist_abs('Niall', 'Neil') 2 >>> cmp.dist_abs('Colin', 'Cuilen') 7 >>> cmp.dist_abs('ATCG', 'TAGC') 117 >>> cmp = Eudex(weights=None) >>> cmp.dist_abs('cat', 'hat') 1 >>> cmp.dist_abs('Niall', 'Neil') 1 >>> cmp.dist_abs('Colin', 'Cuilen') 2 >>> cmp.dist_abs('ATCG', 'TAGC') 9 >>> # Using the OEIS A000142: >>> cmp = Eudex(weights=[1, 1, 2, 6, 24, 120, 720, 5040]) >>> cmp.dist_abs('cat', 'hat') 5040 >>> cmp.dist_abs('Niall', 'Neil') 1 >>> cmp.dist_abs('Colin', 'Cuilen') 7 >>> cmp.dist_abs('ATCG', 'TAGC') 15130 .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ # Calculate the eudex hashes and XOR them xored = int(self._phonetic_alg.encode(src)) ^ int( self._phonetic_alg.encode(tar) ) # Simple hamming distance (all bits are equal) if not self._weights: binary = bin(xored) distance = binary.count('1') # type: float if normalized: return distance / (len(binary) - 2) return distance # If self._weights is a function, it should create a generator, # which we now use to populate a list weights_list = [] # List[float] if hasattr(self._weights, '__iter__') and not isinstance( self._weights, str ): weights_list = cast(List[float], self._weights)[::-1] weights_gen = None elif callable(self._weights): weights_gen = self._weights() elif self._weights == 'exponential': weights_gen = Eudex.gen_exponential() elif self._weights == 'fibonacci': weights_gen = Eudex.gen_fibonacci() else: raise ValueError('Unrecognized weights value or type.') if isinstance(weights_gen, Generator): weights_list = [ next(weights_gen) for _ in range(self._max_length) ][::-1] # Sum the weighted hamming distance distance = 0 max_distance = 0.0 while (xored or normalized) and weights_list: max_distance += 8 * weights_list[-1] distance += bin(xored & 0xFF).count('1') * weights_list.pop() xored >>= 8 if normalized: distance /= max_distance return distance
[docs] def dist(self, src: str, tar: str) -> float: """Return normalized distance between the Eudex hashes of two terms. This is Eudex distance normalized to [0, 1]. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- int The normalized Eudex Hamming distance Examples -------- >>> cmp = Eudex() >>> round(cmp.dist('cat', 'hat'), 12) 0.062745098039 >>> round(cmp.dist('Niall', 'Neil'), 12) 0.000980392157 >>> round(cmp.dist('Colin', 'Cuilen'), 12) 0.004901960784 >>> round(cmp.dist('ATCG', 'TAGC'), 12) 0.197549019608 .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ return self.dist_abs(src, tar, True)
if __name__ == '__main__': import doctest doctest.testmod()