www.ravenbrook.com - versioning.py

"""Implements the low-level algorithms Sphinx uses for versioning doctrees."""
import pickle
from itertools import product, zip_longest
from operator import itemgetter
from os import path
from typing import TYPE_CHECKING, Any, Dict, Iterator
from uuid import uuid4

from docutils.nodes import Node

from sphinx.transforms import SphinxTransform

if TYPE_CHECKING:
    from sphinx.application import Sphinx

try:
    import Levenshtein
    IS_SPEEDUP = True
except ImportError:
    IS_SPEEDUP = False

# anything below that ratio is considered equal/changed
VERSIONING_RATIO = 65


def add_uids(doctree: Node, condition: Any) -> Iterator[Node]:
    """Add a unique id to every node in the `doctree` which matches the
    condition and yield the nodes.

    :param doctree:
        A :class:`docutils.nodes.document` instance.

    :param condition:
        A callable which returns either ``True`` or ``False`` for a given node.
    """
    for node in doctree.findall(condition):
        node.uid = uuid4().hex
        yield node


def merge_doctrees(old: Node, new: Node, condition: Any) -> Iterator[Node]:
    """Merge the `old` doctree with the `new` one while looking at nodes
    matching the `condition`.

    Each node which replaces another one or has been added to the `new` doctree
    will be yielded.

    :param condition:
        A callable which returns either ``True`` or ``False`` for a given node.
    """
    old_iter = old.findall(condition)
    new_iter = new.findall(condition)
    old_nodes = []
    new_nodes = []
    ratios = {}
    seen = set()
    # compare the nodes each doctree in order
    for old_node, new_node in zip_longest(old_iter, new_iter):
        if old_node is None:
            new_nodes.append(new_node)
            continue
        if not getattr(old_node, 'uid', None):
            # maybe config.gettext_uuid has been changed.
            old_node.uid = uuid4().hex
        if new_node is None:
            old_nodes.append(old_node)
            continue
        ratio = get_ratio(old_node.rawsource, new_node.rawsource)
        if ratio == 0:
            new_node.uid = old_node.uid
            seen.add(new_node)
        else:
            ratios[old_node, new_node] = ratio
            old_nodes.append(old_node)
            new_nodes.append(new_node)
    # calculate the ratios for each unequal pair of nodes, should we stumble
    # on a pair which is equal we set the uid and add it to the seen ones
    for old_node, new_node in product(old_nodes, new_nodes):
        if new_node in seen or (old_node, new_node) in ratios:
            continue
        ratio = get_ratio(old_node.rawsource, new_node.rawsource)
        if ratio == 0:
            new_node.uid = old_node.uid
            seen.add(new_node)
        else:
            ratios[old_node, new_node] = ratio
    # choose the old node with the best ratio for each new node and set the uid
    # as long as the ratio is under a certain value, in which case we consider
    # them not changed but different
    ratios = sorted(ratios.items(), key=itemgetter(1))  # type: ignore
    for (old_node, new_node), ratio in ratios:
        if new_node in seen:
            continue
        else:
            seen.add(new_node)
        if ratio < VERSIONING_RATIO:
            new_node.uid = old_node.uid
        else:
            new_node.uid = uuid4().hex
            yield new_node
    # create new uuids for any new node we left out earlier, this happens
    # if one or more nodes are simply added.
    for new_node in set(new_nodes) - seen:
        new_node.uid = uuid4().hex
        yield new_node


def get_ratio(old: str, new: str) -> float:
    """Return a "similarity ratio" (in percent) representing the similarity
    between the two strings where 0 is equal and anything above less than equal.
    """
    if not all([old, new]):
        return VERSIONING_RATIO

    if IS_SPEEDUP:
        return Levenshtein.distance(old, new) / (len(old) / 100.0)
    else:
        return levenshtein_distance(old, new) / (len(old) / 100.0)


def levenshtein_distance(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between two strings *a* and *b*."""
    if a == b:
        return 0
    if len(a) < len(b):
        a, b = b, a
    if not a:
        return len(b)
    previous_row = list(range(len(b) + 1))
    for i, column1 in enumerate(a):
        current_row = [i + 1]
        for j, column2 in enumerate(b):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (column1 != column2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]


class UIDTransform(SphinxTransform):
    """Add UIDs to doctree for versioning."""
    default_priority = 880

    def apply(self, **kwargs: Any) -> None:
        env = self.env
        old_doctree = None
        if not env.versioning_condition:
            return

        if env.versioning_compare:
            # get old doctree
            try:
                filename = path.join(env.doctreedir, env.docname + '.doctree')
                with open(filename, 'rb') as f:
                    old_doctree = pickle.load(f)
            except OSError:
                pass

        # add uids for versioning
        if not env.versioning_compare or old_doctree is None:
            list(add_uids(self.document, env.versioning_condition))
        else:
            list(merge_doctrees(old_doctree, self.document, env.versioning_condition))


def setup(app: "Sphinx") -> Dict[str, Any]:
    app.add_transform(UIDTransform)

    return {
        'version': 'builtin',
        'parallel_read_safe': True,
        'parallel_write_safe': True,
    }