# Source code for kokorog2p.base

"""Abstract base class for G2P (Grapheme-to-Phoneme) converters."""

from abc import ABC, abstractmethod

from .token import GToken


[docs] class G2PBase(ABC): """ Abstract base class for grapheme-to-phoneme converters. Subclasses must implement the `__call__` method to convert text to phonemes. """
[docs] def __init__( self, language: str = "en-us", use_espeak_fallback: bool = True, use_goruut_fallback: bool = False, use_cli: bool = False, strict: bool = True, ) -> None: """ Initialize the G2P converter. Args: language: Language code (e.g., 'en-us', 'en-gb'). use_espeak_fallback: Whether to use espeak for OOV words. use_goruut_fallback: Whether to use goruut for OOV words. use_cli: If True, use CLI phonemizer instead of library bindings. strict: If True, raise exceptions on errors. If False, log warnings and return empty results (backward compatible mode). """ self.language = language self.use_espeak_fallback = use_espeak_fallback self.use_goruut_fallback = use_goruut_fallback self.use_cli = use_cli self.strict = strict self.load_silver: bool | None = None self.load_gold: bool | None = None
@property def is_british(self) -> bool: """Check if this is British English.""" return self.language.lower() in ("en-gb", "en_gb", "british", "gb")
[docs] @abstractmethod def __call__(self, text: str) -> list[GToken]: """ Convert text to a list of tokens with phonemes. Args: text: Input text to convert. Returns: List of GToken objects with phonemes assigned. """ raise NotImplementedError
[docs] def phonemize(self, text: str) -> str: """ Convert text to a phoneme string. This is a convenience method that calls __call__ and joins the results. Args: text: Input text to convert. Returns: Phoneme string with word boundaries. """ tokens = self(text) result: list[str] = [] for token in tokens: if token.phonemes: result.append(token.phonemes) if token.whitespace: result.append(token.whitespace) elif token.is_punctuation: # Keep punctuation as-is result.append(token.text) if token.whitespace: result.append(token.whitespace) return "".join(result).strip()
[docs] def word_to_phonemes(self, word: str, tag: str | None = None) -> str | None: """ Convert a single word to phonemes. Args: word: The word to convert. tag: Optional POS tag for disambiguation. Returns: Phoneme string or None if conversion failed. """ tokens = self(word) if tokens and tokens[0].phonemes: return tokens[0].phonemes return None
[docs] @abstractmethod def lookup(self, word: str, tag: str | None = None) -> str | None: """ Look up a word in the dictionary. Args: word: The word to look up. tag: Optional POS tag for disambiguation. Returns: Phoneme string or None if not found. """ raise NotImplementedError
[docs] def add_abbreviation( self, abbreviation: str, expansion: str | dict[str, str], description: str = "", case_sensitive: bool = False, ) -> None: """Add or update a custom abbreviation (if supported).""" raise NotImplementedError("This G2P does not support abbreviations")
[docs] def remove_abbreviation( self, abbreviation: str, case_sensitive: bool = False ) -> bool: """Remove an abbreviation (if supported).""" return False
[docs] def has_abbreviation(self, abbreviation: str, case_sensitive: bool = False) -> bool: """Check if an abbreviation exists (if supported).""" return False
[docs] def list_abbreviations(self) -> list[str]: """List abbreviations (if supported).""" return []
[docs] def __repr__(self) -> str: """Return a string representation.""" return f"{self.__class__.__name__}(language={self.language!r})"