# Source code for kokorog2p.de.lexicon

"""German lexicon for G2P lookup.

Provides dictionary-based phoneme lookup for German words.
"""

import importlib.resources
import json
from functools import lru_cache

from kokorog2p.de import data


@lru_cache(maxsize=1)
def _read_gold_dictionary() -> dict[str, str]:
    """Read and parse the bundled ``de_gold.json`` resource.

    The parsed mapping is cached: the file is read at most once until the
    cache is cleared, and every caller receives the same dict object.

    Returns:
        The mapping parsed from ``de_gold.json``.
    """
    resource = importlib.resources.files(data) / "de_gold.json"
    with resource.open("r", encoding="utf-8") as f:
        return json.load(f)


def _load_gold_dictionary(load_gold: bool = True) -> dict[str, str]:
    """Load the German gold dictionary.

    Caching is done on the argument-less file read rather than on this
    function, so a ``load_gold=False`` call can no longer evict the parsed
    dictionary from a one-slot cache, and positional vs. keyword calls do
    not create separate cache entries.

    Args:
        load_gold: If True, load the dictionary; if False, return empty dict.

    Returns:
        Dictionary mapping lowercase words to IPA phonemes. When
        ``load_gold`` is false, a new empty dict is returned on every call
        so that callers never share (and cannot pollute) the empty result.
    """
    if not load_gold:
        return {}
    return _read_gold_dictionary()


# Backward compatibility: this name used to be the lru_cache wrapper itself,
# so keep its cache-management helpers reachable under the original name.
_load_gold_dictionary.cache_clear = _read_gold_dictionary.cache_clear  # type: ignore[attr-defined]
_load_gold_dictionary.cache_info = _read_gold_dictionary.cache_info  # type: ignore[attr-defined]


[docs] class GermanLexicon: """German pronunciation lexicon. Uses a gold dictionary for lookup with optional fallback. Example: >>> lexicon = GermanLexicon() >>> lexicon.lookup("Haus") 'haʊ̯s' """
[docs] def __init__( self, strip_stress: bool = False, load_silver: bool = True, load_gold: bool = True, ) -> None: """Initialize the German lexicon. Args: strip_stress: If True, remove stress markers from phonemes. load_silver: If True, load silver tier dictionary if available. Currently German only has gold dictionary, so this parameter is reserved for future use and consistency with English. Defaults to True for consistency. load_gold: If True, load gold tier dictionary. Defaults to True for maximum quality and coverage. Set to False when ultra-fast initialization is needed. """ self._gold = _load_gold_dictionary(load_gold=load_gold) self._strip_stress = strip_stress self.load_silver = load_silver self.load_gold = load_gold
# Silver dictionary not yet available for German
[docs] def lookup(self, word: str, tag: str | None = None) -> str | None: """Look up a word in the lexicon. Args: word: The word to look up. tag: Optional POS tag (not used for German). Returns: IPA phoneme string if found, None otherwise. """ word_lower = word.lower() phonemes = self._gold.get(word_lower) if phonemes and self._strip_stress: # Remove primary and secondary stress markers phonemes = phonemes.replace("ˈ", "").replace("ˌ", "") return phonemes
[docs] def __call__(self, word: str, tag: str | None = None) -> str | None: """Look up a word in the lexicon. Args: word: The word to look up. tag: Optional POS tag. Returns: IPA phoneme string if found, None otherwise. """ return self.lookup(word, tag)
[docs] def is_known(self, word: str) -> bool: """Check if a word is in the lexicon. Args: word: The word to check. Returns: True if the word is in the lexicon. """ return word.lower() in self._gold
[docs] def __len__(self) -> int: """Return the number of entries in the lexicon.""" return len(self._gold)
[docs] def __repr__(self) -> str: """Return string representation.""" return f"GermanLexicon(entries={len(self)})"