# Source code for kokorog2p.goruut_g2p

"""Goruut-only G2P for languages supported by pygoruut.

This module provides a simple G2P implementation that uses pygoruut
directly for phonemization. It's an alternative to espeak-based G2P
for languages supported by goruut.

Copyright 2024 kokorog2p contributors
Licensed under the Apache License, Version 2.0
"""

import logging
import re

from kokorog2p.base import G2PBase
from kokorog2p.token import GToken
from kokorog2p.tokenization import ensure_gtoken_positions

# Module-level logger; backend failures are reported here when strict=False.
logger = logging.getLogger(__name__)


class GoruutOnlyG2P(G2PBase):
    """G2P implementation using only pygoruut/goruut.

    This is used as an alternative to espeak for languages that pygoruut
    supports well. It provides phonemization via the goruut engine.

    Text passed to ``__call__`` is split on whitespace and on a fixed set of
    punctuation characters; every remaining piece is phonemized on its own
    through the lazily created goruut backend.

    Example:
        >>> g2p = GoruutOnlyG2P("fr")
        >>> tokens = g2p("Bonjour le monde")
    """

    # Characters emitted as standalone PUNCT tokens. Defined right next to the
    # split pattern so the two definitions are easy to keep in sync.
    _PUNCTUATION = frozenset(",.!?;:\"'()[]{}—–-")

    # Splits on runs of whitespace and on single punctuation characters. The
    # capturing group makes ``split`` keep the separators in its result, which
    # is how whitespace and punctuation survive tokenization. Compiled once
    # here instead of on every call.
    _SPLIT_PATTERN = re.compile(r"(\s+|[,.!?;:\"'()\[\]{}—–\-])")

    def __init__(
        self,
        language: str = "en-us",
        use_espeak_fallback: bool = False,  # Not used for this class
        use_goruut_fallback: bool = True,  # Not used for this class
        strict: bool = True,
        version: str = "1.0",
        **kwargs,
    ) -> None:
        """Initialize the goruut-only G2P.

        Args:
            language: Language code (e.g., 'fr', 'de', 'en-us').
            use_espeak_fallback: Ignored (always uses goruut).
            use_goruut_fallback: Ignored (always uses goruut).
            strict: If True (default), raise exceptions on errors.
                If False, log the error and return empty results for
                backward compatibility.
            version: Model version (default: "1.0").
            **kwargs: Additional arguments (ignored).
        """
        # The fallback flags are forced regardless of what the caller passed:
        # this class only ever talks to goruut.
        super().__init__(
            language=language,
            use_espeak_fallback=False,
            use_goruut_fallback=True,
            strict=strict,
        )
        self.version = version
        # Created on first access by the ``goruut_backend`` property.
        self._goruut_backend = None

    @property
    def goruut_backend(self):
        """Return the goruut backend, creating it on first access.

        The backend module is imported here rather than at module import time,
        so constructing this class does not import it.
        """
        if self._goruut_backend is None:
            from kokorog2p.backends.goruut import GoruutBackend

            self._goruut_backend = GoruutBackend(
                language=self.language,
                with_stress=True,
            )
        return self._goruut_backend

    def _handle_backend_error(
        self,
        exc: Exception,
        *,
        runtime_message: str,
        unexpected_message: str,
        log_message: str,
    ) -> None:
        """Apply the strict / non-strict policy to a backend failure.

        Must be called from inside the ``except`` block that caught ``exc``.

        Args:
            exc: The exception raised while using the goruut backend.
            runtime_message: Message used when ``exc`` is a ``RuntimeError``.
            unexpected_message: Message used for any other exception type.
            log_message: Message logged when ``strict`` is False.

        Raises:
            RuntimeError: If ``self.strict`` is True; chained to ``exc``.
        """
        if self.strict:
            if isinstance(exc, RuntimeError):
                raise RuntimeError(runtime_message) from exc
            raise RuntimeError(unexpected_message) from exc
        logger.error(log_message)

    def __call__(self, text: str) -> list[GToken]:
        """Convert text to tokens with phonemes.

        Args:
            text: Input text to convert.

        Returns:
            List of GToken objects with phonemes. Empty or whitespace-only
            input yields an empty list.

        Raises:
            RuntimeError: If goruut fails on a word and strict=True.
        """
        if not text or not text.strip():
            return []

        tokens: list[GToken] = []

        # NOTE(review): apostrophes and hyphens are in the split set, so words
        # such as "l'ami" or "well-known" are phonemized piece by piece --
        # confirm this is the intended behaviour for the target languages.
        for part in self._SPLIT_PATTERN.split(text):
            if not part:
                continue

            if part.isspace():
                # Whitespace is attached to the token it follows. Leading
                # whitespace has no preceding token and is dropped.
                if tokens:
                    tokens[-1].whitespace = part
                continue

            if part in self._PUNCTUATION:
                # Punctuation is kept as-is in the phoneme stream.
                tokens.append(
                    GToken(
                        text=part,
                        tag="PUNCT",
                        whitespace="",
                        phonemes=part,
                    )
                )
                continue

            # Phonemize using goruut. The broad ``except`` is deliberate:
            # backend creation (lazy import) and phonemization failures are
            # both funnelled through the strict / non-strict policy.
            try:
                phonemes = self.goruut_backend.word_phonemes(part)
            except Exception as e:
                self._handle_backend_error(
                    e,
                    runtime_message=(
                        f"GoruutOnlyG2P failed to process word '{part}' "
                        f"with goruut. This usually means pygoruut is not "
                        f"properly installed or initialized. "
                        f"Original error: {e}"
                    ),
                    unexpected_message=(
                        f"Unexpected error processing word '{part}': {e}"
                    ),
                    log_message=(
                        f"GoruutOnlyG2P failed to process word '{part}': {e}. "
                        f"Returning None (strict=False mode)."
                    ),
                )
                phonemes = None

            token = GToken(
                text=part,
                tag="X",  # Unknown tag
                whitespace="",
                phonemes=phonemes or None,  # Empty results are stored as None
            )
            token.rating = "goruut" if phonemes else None
            tokens.append(token)

        ensure_gtoken_positions(tokens, text)
        return tokens

    def lookup(self, word: str, tag: str | None = None) -> str | None:
        """Look up a word using goruut.

        Args:
            word: The word to look up.
            tag: Optional POS tag (ignored for goruut).

        Returns:
            Phoneme string from goruut, or None if strict=False and an
            error occurs.

        Raises:
            RuntimeError: If goruut backend fails and strict=True.
        """
        try:
            return self.goruut_backend.word_phonemes(word)
        except Exception as e:
            self._handle_backend_error(
                e,
                runtime_message=(
                    f"GoruutOnlyG2P.lookup() failed for word '{word}' with goruut. "
                    f"Original error: {e}"
                ),
                unexpected_message=f"Unexpected error in lookup for '{word}': {e}",
                log_message=(
                    f"GoruutOnlyG2P.lookup() failed for word '{word}': {e}. "
                    f"Returning None (strict=False mode)."
                ),
            )
            return None

    def phonemize(self, text: str) -> str:
        """Convert text to phonemes using goruut.

        Args:
            text: Input text to convert.

        Returns:
            Phoneme string, or empty string if strict=False and an error
            occurs.

        Raises:
            RuntimeError: If goruut backend fails and strict=True.
        """
        try:
            return self.goruut_backend.phonemize(text)
        except Exception as e:
            self._handle_backend_error(
                e,
                # goruut initialization or configuration errors
                runtime_message=(
                    f"GoruutOnlyG2P failed to phonemize text with goruut. "
                    f"This usually means pygoruut is not properly "
                    f"installed or initialized. Original error: {e}"
                ),
                # Unexpected errors - don't hide them!
                unexpected_message=(
                    f"Unexpected error in GoruutOnlyG2P.phonemize(): {e}"
                ),
                log_message=(
                    f"GoruutOnlyG2P.phonemize() failed: {e}. "
                    f"Returning empty string (strict=False mode)."
                ),
            )
            return ""

    @staticmethod
    def is_available() -> bool:
        """Check if pygoruut is available.

        Returns:
            The result of ``GoruutBackend.is_available()``, or False if the
            backend module cannot be imported.
        """
        try:
            from kokorog2p.backends.goruut import GoruutBackend

            return GoruutBackend.is_available()
        except ImportError:
            return False

    def __repr__(self) -> str:
        """Return a short representation showing the configured language."""
        return f"GoruutOnlyG2P(language={self.language!r})"