www.ravenbrook.com - texescape.py

"""TeX escaping helper."""

import re
from typing import Dict

tex_replacements = [
    # map TeX special chars
    ('$', r'\$'),
    ('%', r'\%'),
    ('&', r'\&'),
    ('#', r'\#'),
    ('_', r'\_'),
    ('{', r'\{'),
    ('}', r'\}'),
    ('\\', r'\textbackslash{}'),
    ('~', r'\textasciitilde{}'),
    ('^', r'\textasciicircum{}'),
    # map chars to avoid mis-interpretation in LaTeX
    ('[', r'{[}'),
    (']', r'{]}'),
    # map special Unicode characters to TeX commands
    ('✓', r'\(\checkmark\)'),
    ('✔', r'\(\pmb{\checkmark}\)'),
    ('✕', r'\(\times\)'),
    ('✖', r'\(\pmb{\times}\)'),
    # used to separate -- in options
    ('', r'{}'),
    # map some special Unicode characters to similar ASCII ones
    # (even for Unicode LaTeX as may not be supported by OpenType font)
    ('⎽', r'\_'),
    ('ℯ', r'e'),
    ('ⅈ', r'i'),
    # Greek alphabet not escaped: pdflatex handles it via textalpha and inputenc
    # OHM SIGN U+2126 is handled by LaTeX textcomp package
]

# A map to avoid TeX ligatures or character replacements in PDF output
# xelatex/lualatex/uplatex are handled differently (#5790, #6888)
ascii_tex_replacements = [
    # Note: the " renders curly in OT1 encoding but straight in T1, T2A, LY1...
    #       escaping it to \textquotedbl would break documents using OT1
    #       Sphinx does \shorthandoff{"} to avoid problems with some languages
    # There is no \text... LaTeX escape for the hyphen character -
    ('-', r'\sphinxhyphen{}'),  # -- and --- are TeX ligatures
    # ,, is a TeX ligature in T1 encoding, but escaping the comma adds
    # complications (whether by {}, or a macro) and is not done
    # the next two require textcomp package
    ("'", r'\textquotesingle{}'),  # else ' renders curly, and '' is a ligature
    ('`', r'\textasciigrave{}'),   # else \` and \`\` render curly
    ('<', r'\textless{}'),     # < is inv. exclam in OT1, << is a T1-ligature
    ('>', r'\textgreater{}'),  # > is inv. quest. mark in 0T1, >> a T1-ligature
]

# A map Unicode characters to LaTeX representation
# (for LaTeX engines which don't support unicode)
unicode_tex_replacements = [
    # map some more common Unicode characters to TeX commands
    ('¶', r'\P{}'),
    ('§', r'\S{}'),
    ('€', r'\texteuro{}'),
    ('∞', r'\(\infty\)'),
    ('±', r'\(\pm\)'),
    ('→', r'\(\rightarrow\)'),
    ('‣', r'\(\rightarrow\)'),
    ('–', r'\textendash{}'),
    # superscript
    ('⁰', r'\(\sp{\text{0}}\)'),
    ('¹', r'\(\sp{\text{1}}\)'),
    ('²', r'\(\sp{\text{2}}\)'),
    ('³', r'\(\sp{\text{3}}\)'),
    ('⁴', r'\(\sp{\text{4}}\)'),
    ('⁵', r'\(\sp{\text{5}}\)'),
    ('⁶', r'\(\sp{\text{6}}\)'),
    ('⁷', r'\(\sp{\text{7}}\)'),
    ('⁸', r'\(\sp{\text{8}}\)'),
    ('⁹', r'\(\sp{\text{9}}\)'),
    # subscript
    ('₀', r'\(\sb{\text{0}}\)'),
    ('₁', r'\(\sb{\text{1}}\)'),
    ('₂', r'\(\sb{\text{2}}\)'),
    ('₃', r'\(\sb{\text{3}}\)'),
    ('₄', r'\(\sb{\text{4}}\)'),
    ('₅', r'\(\sb{\text{5}}\)'),
    ('₆', r'\(\sb{\text{6}}\)'),
    ('₇', r'\(\sb{\text{7}}\)'),
    ('₈', r'\(\sb{\text{8}}\)'),
    ('₉', r'\(\sb{\text{9}}\)'),
]

# TODO: this should be called tex_idescape_map because its only use is in
#       sphinx.writers.latex.LaTeXTranslator.idescape()
# %, {, }, \, #, and ~ are the only ones which must be replaced by _ character
# It would be simpler to define it entirely here rather than in init().
# Unicode replacements are superfluous, as idescape() uses backslashreplace
tex_replace_map: Dict[int, str] = {}

_tex_escape_map: Dict[int, str] = {}
_tex_escape_map_without_unicode: Dict[int, str] = {}
_tex_hlescape_map: Dict[int, str] = {}
_tex_hlescape_map_without_unicode: Dict[int, str] = {}


def escape(s: str, latex_engine: str = None) -> str:
    """Escape text for LaTeX output."""
    if latex_engine in ('lualatex', 'xelatex'):
        # unicode based LaTeX engine
        return s.translate(_tex_escape_map_without_unicode)
    else:
        return s.translate(_tex_escape_map)


def hlescape(s: str, latex_engine: str = None) -> str:
    """Escape text for LaTeX highlighter."""
    if latex_engine in ('lualatex', 'xelatex'):
        # unicode based LaTeX engine
        return s.translate(_tex_hlescape_map_without_unicode)
    else:
        return s.translate(_tex_hlescape_map)


def escape_abbr(text: str) -> str:
    """Adjust spacing after abbreviations. Works with @ letter or other."""
    return re.sub(r'\.(?=\s|$)', r'.\@{}', text)


def init() -> None:
    for a, b in tex_replacements:
        _tex_escape_map[ord(a)] = b
        _tex_escape_map_without_unicode[ord(a)] = b
        tex_replace_map[ord(a)] = '_'

    # no reason to do this for _tex_escape_map_without_unicode
    for a, b in ascii_tex_replacements:
        _tex_escape_map[ord(a)] = b

    # but the hyphen has a specific PDF bookmark problem
    # https://github.com/latex3/hyperref/issues/112
    _tex_escape_map_without_unicode[ord('-')] = r'\sphinxhyphen{}'

    for a, b in unicode_tex_replacements:
        _tex_escape_map[ord(a)] = b
        #  This is actually unneeded:
        tex_replace_map[ord(a)] = '_'

    for a, b in tex_replacements:
        if a in '[]{}\\':
            continue
        _tex_hlescape_map[ord(a)] = b
        _tex_hlescape_map_without_unicode[ord(a)] = b

    for a, b in unicode_tex_replacements:
        _tex_hlescape_map[ord(a)] = b