Source code for kokorog2p.fr.lexicon

"""French lexicon for G2P lookup.

Based on misaki French implementation, adapted for kokorog2p.
"""

import importlib.resources
import json
import re
import unicodedata
from dataclasses import dataclass
from typing import Any, Final

from kokorog2p.fr import data

# =============================================================================
# Constants
# =============================================================================

# Valid character ordinals for lexicon lookup (includes French accented chars)
LEXICON_ORDS: Final[list[int]] = [
    39,  # '
    45,  # -
    *range(65, 91),  # A-Z
    *range(97, 123),  # a-z
    192,  # À
    194,  # Â
    196,  # Ä
    199,  # Ç
    200,  # È
    201,  # É
    202,  # Ê
    203,  # Ë
    206,  # Î
    207,  # Ï
    212,  # Ô
    217,  # Ù
    219,  # Û
    220,  # Ü
    224,  # à
    226,  # â
    228,  # ä
    231,  # ç
    232,  # è
    233,  # é
    234,  # ê
    235,  # ë
    238,  # î
    239,  # ï
    244,  # ô
    249,  # ù
    251,  # û
    252,  # ü
    339,  # œ
    338,  # Œ
    230,  # æ
    198,  # Æ
]

# Consonants (French)
CONSONANTS: Final[frozenset[str]] = frozenset("bdfhjklmnpstvwzðŋɲɡʁʃʒ")

# Vowels (French including nasal vowels)
VOWELS: Final[frozenset[str]] = frozenset("aeiouyøœəɛɔɑɑ̃ɛ̃ɔ̃œ̃")

# Semi-vowels
SEMI_VOWELS: Final[frozenset[str]] = frozenset("jwɥ")

# Symbol mappings
SYMBOLS: Final[dict[str, str]] = {
    "%": "pour cent",
    "&": "et",
    "+": "plus",
    "@": "arobase",
}

# Currency symbols
CURRENCIES: Final[dict[str, tuple[str, str]]] = {
    "€": ("euro", "centime"),
    "$": ("dollar", "cent"),
    "£": ("livre", "pence"),
}

# Common French abbreviations
ABBREVIATIONS: Final[dict[str, str]] = {
    # Titles
    "M.": "monsieur",
    "Mme": "madame",
    "Mlle": "mademoiselle",
    "Dr": "docteur",
    "Pr": "professeur",
    "Me": "maître",
    "Mgr": "monseigneur",
    "St": "saint",
    "Ste": "sainte",
    # Common abbreviations
    "etc.": "et cetera",
    "cf.": "confer",
    "ex.": "exemple",
    "n°": "numéro",
    "N°": "numéro",
    "p.": "page",
    "pp.": "pages",
    "vol.": "volume",
    "chap.": "chapitre",
    "éd.": "édition",
    "env.": "environ",
    "min.": "minute",
    "sec.": "seconde",
    "h": "heure",
    "km": "kilomètre",
    "m": "mètre",
    "cm": "centimètre",
    "mm": "millimètre",
    "kg": "kilogramme",
    "g": "gramme",
    "mg": "milligramme",
    "l": "litre",
    "ml": "millilitre",
}

# Ordinal suffixes
ORDINALS: Final[dict[str, str]] = {
    "1er": "premier",
    "1ère": "première",
    "1re": "première",
    "2e": "deuxième",
    "2ème": "deuxième",
    "2nd": "second",
    "2nde": "seconde",
    "3e": "troisième",
    "3ème": "troisième",
}


# =============================================================================
# Helper Classes
# =============================================================================


@dataclass
class TokenContext:
    """Context information for token processing."""

    future_vowel: bool | None = None
    liaison: bool = False


# =============================================================================
# Lexicon Class
# =============================================================================



[docs]
class FrenchLexicon:
    """Dictionary-based G2P lookup for French with gold dictionary."""


[docs]
    def __init__(self, load_silver: bool = True, load_gold: bool = True) -> None:
        """Initialize the French lexicon.

        Args:
            load_silver: If True, load silver tier dictionary if available.
                Currently French only has gold dictionary, so this parameter
                is reserved for future use and consistency with English.
                Defaults to True for consistency.
            load_gold: If True, load gold tier dictionary.
                Defaults to True for maximum quality and coverage.
                Set to False when ultra-fast initialization is needed.
        """
        self.load_silver = load_silver
        self.load_gold = load_gold
        self.golds: dict[str, str | dict[str, str | None]] = {}
        self.silvers: dict[str, str] = {}

        # Load gold dictionary if requested
        if load_gold:
            files = importlib.resources.files(data)
            with (files / "fr_gold.json").open("r", encoding="utf-8") as r:
                self.golds = self._grow_dictionary(json.load(r))

        # Silver dictionary not yet available for French
        # When available, load it conditionally:
        # if load_silver:
        #     with importlib.resources.open_text(data, "fr_silver.json") as r:
        #         self.silvers = self._grow_dictionary(json.load(r))

        # Initialize built-in pronunciation fixes (highest priority)
        self._init_builtin_fixes()


    def _init_builtin_fixes(self) -> None:
        """Initialize built-in pronunciation corrections.

        These override dictionary pronunciations for common errors.
        """
        self.builtin: dict[str, str] = {
            # Verbs with -ait/-ais (imparfait) - often mispronounced
            "était": "etɛ",
            "étais": "etɛ",
            "étaient": "etɛ",
            "avait": "avɛ",
            "avais": "avɛ",
            "avaient": "avɛ",
            "fait": "fɛ",
            "fais": "fɛ",
            "faite": "fɛt",
            "faites": "fɛt",
            "savait": "savɛ",
            "savais": "savɛ",
            "disait": "dizɛ",
            "faisait": "fəzɛ",
            "allait": "alɛ",
            "venait": "vənɛ",
            "devait": "dəvɛ",
            "pouvait": "puvɛ",
            "voulait": "vulɛ",
            # Common words
            "monsieur": "məsjø",
            "messieurs": "mesjø",
            "madame": "madam",
            "mademoiselle": "madmwazɛl",
            "aujourd'hui": "oʒuʁdɥi",
            # Silent letters and liaisons
            "les": "le",
            "des": "de",
            "est": "ɛ",
            "et": "e",
        }

    @staticmethod
    def _grow_dictionary(d: dict[str, Any]) -> dict[str, Any]:
        """Expand dictionary with capitalization variants.

        Args:
            d: Original dictionary.

        Returns:
            Expanded dictionary with capitalized variants.
        """
        e: dict[str, Any] = {}
        for k, v in d.items():
            if len(k) < 2:
                continue
            if k == k.lower():
                cap = k.capitalize()
                if k != cap:
                    e[cap] = v
            elif k == k.lower().capitalize():
                e[k.lower()] = v
        return {**e, **d}


[docs]
    def is_known(self, word: str, tag: str | None = None) -> bool:
        """Check if a word is in the lexicon."""
        word_lower = word.lower()
        return (
            word in self.golds
            or word_lower in self.golds
            or word_lower in self.builtin
            or word in SYMBOLS
        )



[docs]
    def lookup(
        self,
        word: str,
        tag: str | None = None,
        ctx: TokenContext | None = None,
    ) -> tuple[str | None, int | None]:
        """Look up a word in the lexicon.

        Args:
            word: Word to look up.
            tag: POS tag (optional).
            ctx: Token context (optional).

        Returns:
            Tuple of (phonemes, rating) or (None, None) if not found.
        """
        word_lower = word.lower()

        # Check built-in fixes first (highest priority after gold)
        if word_lower in self.builtin:
            return (self.builtin[word_lower], 4)

        # Check gold dictionary
        ps = self.golds.get(word) or self.golds.get(word_lower)

        if ps is None:
            return (None, None)

        # Handle heteronyms (dict entries)
        if isinstance(ps, dict):
            if isinstance(ps, dict):
                if tag and tag in ps:
                    return (ps[tag], 4)
                return (ps.get("DEFAULT", list(ps.values())[0]), 4)
        return (ps, 4)



[docs]
    def expand_abbreviation(self, text: str) -> str:
        """Expand common French abbreviations."""
        for abbr, expansion in ABBREVIATIONS.items():
            pattern = re.escape(abbr)
            if abbr.endswith("."):
                text = re.sub(
                    rf"\b{pattern}(?=\s|$|[,;:!?])",
                    expansion,
                    text,
                    flags=re.IGNORECASE,
                )
            else:
                text = re.sub(rf"\b{pattern}\b", expansion, text, flags=re.IGNORECASE)
        return text



[docs]
    def expand_ordinals(self, text: str) -> str:
        """Expand ordinal numbers."""
        for ordinal, expansion in ORDINALS.items():
            text = re.sub(
                rf"\b{re.escape(ordinal)}\b", expansion, text, flags=re.IGNORECASE
            )
        return text



[docs]
    def get_special_case(
        self,
        word: str,
        tag: str | None,
        ctx: TokenContext | None,
    ) -> tuple[str | None, int | None]:
        """Handle special case words with context-dependent pronunciations."""
        if word in SYMBOLS:
            return self.lookup(SYMBOLS[word], None, ctx)
        return (None, None)



[docs]
    @staticmethod
    def normalize_word(word: str) -> str:
        """Normalize a word for lookup."""
        # Replace curly quotes
        word = word.replace(chr(8216), "'").replace(chr(8217), "'")
        # Normalize unicode
        word = unicodedata.normalize("NFC", word)
        return word



[docs]
    def __call__(
        self,
        word: str,
        tag: str | None = None,
        ctx: TokenContext | None = None,
    ) -> tuple[str | None, int | None]:
        """Look up phonemes for a word.

        Args:
            word: Word to look up.
            tag: POS tag.
            ctx: Token context.

        Returns:
            Tuple of (phonemes, rating) or (None, None) if not found.
        """
        # Normalize the word
        word = self.normalize_word(word)

        # Check special cases first
        ps, rating = self.get_special_case(word, tag, ctx)
        if ps is not None:
            return (ps, rating)

        # Standard lookup
        return self.lookup(word, tag, ctx)