# Source code for kokorog2p.de.g2p

"""German G2P (Grapheme-to-Phoneme) converter.

Grapheme to Phoneme for German language using dictionary lookup
with rule-based fallback.

German Phonology features:
- Final obstruent devoicing (Auslautverhärtung)
- Vowel length distinction
- Umlauts (ä, ö, ü)
- ß (Eszett)
- CH as [ç] or [x] depending on context (ich-Laut vs ach-Laut)
- Voicing assimilation in consonant clusters
- Schwa in unstressed syllables

Reference:
https://en.wikipedia.org/wiki/Standard_German_phonology
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Final

from kokorog2p.base import G2PBase
from kokorog2p.pipeline.tokenizer import RegexTokenizer, SpacyTokenizer
from kokorog2p.token import GToken
from kokorog2p.tokenization import ensure_gtoken_positions

if TYPE_CHECKING:
    from kokorog2p.de.lexicon import GermanLexicon

# =============================================================================
# German Phoneme Mappings
# =============================================================================

# Basic IPA mappings for German graphemes
# Note: Many mappings are context-dependent and handled by rules
# The grapheme table is assembled from one sub-table per category.  The
# categories are merged in the order below, so the iteration order of ``IPA``
# is: short vowels, long vowels, diphthongs, single consonants, multigraphs.

# Single vowel letters, read as short vowels.
_SHORT_VOWEL_GRAPHEMES: Final[dict[str, str]] = {
    "a": "a",
    "e": "ɛ",
    "i": "ɪ",
    "o": "ɔ",
    "u": "ʊ",
    "ä": "ɛ",
    "ö": "œ",
    "ü": "ʏ",
    "y": "ʏ",
}

# Long vowels written with a doubled letter, "ie", or a lengthening "h".
_LONG_VOWEL_GRAPHEMES: Final[dict[str, str]] = {
    "aa": "aː",
    "ee": "eː",
    "ie": "iː",
    "oo": "oː",
    "uh": "uː",
    "äh": "ɛː",
    "öh": "øː",
    "üh": "yː",
    "ah": "aː",
    "eh": "eː",
    "ih": "iː",
    "oh": "oː",
}

# Diphthong spellings.
_DIPHTHONG_GRAPHEMES: Final[dict[str, str]] = {
    "ei": "aɪ",
    "ai": "aɪ",
    "ey": "aɪ",
    "ay": "aɪ",
    "au": "aʊ",
    "eu": "ɔʏ",
    "äu": "ɔʏ",
}

# Single consonant letters.
_CONSONANT_GRAPHEMES: Final[dict[str, str]] = {
    "b": "b",
    "c": "k",
    "d": "d",
    "f": "f",
    "g": "ɡ",
    "h": "h",
    "j": "j",
    "k": "k",
    "l": "l",
    "m": "m",
    "n": "n",
    "p": "p",
    "q": "k",
    "r": "ʁ",
    "s": "z",  # voiced by default; devoiced in certain contexts
    "t": "t",
    "v": "f",  # usually [f] in German
    "w": "v",
    "x": "ks",
    "z": "ʦ",
    "ß": "s",
}

# Consonant digraphs, trigraphs and tetragraphs.
_MULTIGRAPH_GRAPHEMES: Final[dict[str, str]] = {
    "ch": "x",  # ach-Laut by default; the ich-Laut is chosen by rule
    "ck": "k",
    "dt": "t",
    "ng": "ŋ",
    "nk": "ŋk",
    "ph": "f",
    "pf": "pf",
    "qu": "kv",
    "sch": "ʃ",
    "sp": "ʃp",  # word-initial reading
    "st": "ʃt",  # word-initial reading
    "ss": "s",
    "th": "t",
    "tz": "ʦ",
    "tsch": "ʧ",
    "dsch": "ʤ",
    "chs": "ks",
}

# Basic grapheme -> IPA table.  Many of these readings are context-dependent
# defaults that the rule-based converter refines.
IPA: Final[dict[str, str]] = {
    **_SHORT_VOWEL_GRAPHEMES,
    **_LONG_VOWEL_GRAPHEMES,
    **_DIPHTHONG_GRAPHEMES,
    **_CONSONANT_GRAPHEMES,
    **_MULTIGRAPH_GRAPHEMES,
}

# Voiced-unvoiced consonant pairs for final devoicing
# (voiced, voiceless) obstruent pairs used for final devoicing.  Both the
# ASCII "g" and the IPA "ɡ" (U+0261) are listed so either spelling is devoiced.
_DEVOICING_PAIRS: Final[tuple[tuple[str, str], ...]] = (
    ("b", "p"),
    ("d", "t"),
    ("g", "k"),
    ("ɡ", "k"),
    ("v", "f"),
    ("z", "s"),
    ("ʒ", "ʃ"),
)

# Lookup table: voiced obstruent -> its voiceless counterpart.
VOICED_TO_UNVOICED: Final[dict[str, str]] = dict(_DEVOICING_PAIRS)


def _is_front_vowel_context(prev_chars: str) -> bool:
    """Check if the previous character(s) form a front vowel context for ich-Laut."""
    prev_lower = prev_chars.lower()
    # Check for front vowels and consonants l, n, r
    if prev_lower in ("i", "e", "ä", "ö", "ü", "y"):
        return True
    if prev_lower in ("l", "n", "r"):
        return True
    # Check for diphthongs ending in front vowel
    if prev_lower.endswith(("ei", "ai", "eu", "äu", "ie", "ey", "ay")):
        return True
    return False


def normalize_to_kokoro(phonemes: str, use_tie_replacement: bool = False) -> str:
    """Rewrite a German IPA string so it only uses Kokoro-friendly symbols.

    Args:
        phonemes: IPA phoneme string, possibly containing combining
            diacritics or tie bars.
        use_tie_replacement: When True, every tie bar (U+0361) is first
            turned into "^" and the known tied pairs are then collapsed into
            single replacement symbols. Defaults to False, in which case tie
            bars are left untouched.

    Returns:
        The normalized phoneme string. A falsy input is returned unchanged.
    """
    if not phonemes:
        return phonemes

    normalized = phonemes

    if use_tie_replacement:
        # Applied strictly in this order: the tie bar becomes "^" first so
        # that the pair patterns below can match.
        tie_substitutions = (
            ("\u0361", "^"),
            ("a^ɪ", "I"),
            ("a^ʊ", "W"),
            ("d^z", "ʣ"),
            ("d^ʒ", "ʤ"),
            ("e^ɪ", "A"),
            ("o^ʊ", "O"),
            ("ə^ʊ", "Q"),
            ("s^s", "S"),
            ("t^s", "ʦ"),
            ("t^ʃ", "ʧ"),
            ("ɔ^ɪ", "Y"),
        )
        for pattern, replacement in tie_substitutions:
            normalized = normalized.replace(pattern, replacement)

    # Single-character clean-up done in one pass:
    #   U+032F COMBINING INVERTED BREVE BELOW (non-syllabic marker) -> dropped
    #   U+0329 COMBINING VERTICAL LINE BELOW (syllabic marker)      -> dropped
    #   "ʏ" (small capital Y, not in the Kokoro vocab)              -> "y"
    return normalized.translate({0x032F: None, 0x0329: None, ord("ʏ"): "y"})


class GermanG2P(G2PBase):
    """German G2P converter using dictionary lookup with fallback options.

    This class provides grapheme-to-phoneme conversion for German text
    using a large dictionary (738k+ entries) with fallback to espeak-ng
    or goruut for out-of-vocabulary words and phonological rules.

    For every word the sources are tried in this order: lexicon, external
    fallback (goruut or espeak), built-in spelling rules.

    Example:
        >>> g2p = GermanG2P()
        >>> tokens = g2p("Guten Tag")
        >>> for token in tokens:
        ...     print(f"{token.text} -> {token.phonemes}")
    """

    # Letters treated as vowels / consonants by the spelling rules.
    _VOWEL_LETTERS: Final[str] = "aeiouäöü"
    _CONSONANT_LETTERS: Final[str] = "bcdfghjklmnpqrstvwxz"

    # Short and long reading of each vowel letter.
    _SHORT_VOWELS: Final[dict[str, str]] = {
        "a": "a",
        "e": "ɛ",
        "i": "ɪ",
        "o": "ɔ",
        "u": "ʊ",
        "ä": "ɛ",
        "ö": "œ",
        "ü": "ʏ",
    }
    _LONG_VOWELS: Final[dict[str, str]] = {
        "a": "aː",
        "e": "eː",
        "i": "iː",
        "o": "oː",
        "u": "uː",
        "ä": "ɛː",
        "ö": "øː",
        "ü": "yː",
    }

    # Word endings after which a preceding "e" is read as schwa.
    _SCHWA_ENDINGS: Final[frozenset[str]] = frozenset(
        ("n", "l", "r", "m", "ns", "ln", "rn", "ls", "rs")
    )

    # Phoneme strings that count as vowels when locating the final
    # consonant cluster for devoicing.
    _VOWEL_PHONEMES: Final[frozenset[str]] = frozenset(
        (
            "a", "aː", "e", "eː", "ɛ", "ɛː", "i", "iː", "ɪ",
            "o", "oː", "ɔ", "u", "uː", "ʊ", "y", "yː", "ʏ",
            "ø", "øː", "œ", "ə", "ɐ", "aɪ", "aʊ", "ɔʏ",
        )
    )

    # Punctuation marks that exist in the Kokoro vocabulary.
    # See kokorog2p/data/kokoro_config.json
    _PUNCTUATION: Final[frozenset[str]] = frozenset(';:,.!?"()')

    def __init__(
        self,
        language: str = "de-de",
        use_espeak_fallback: bool = True,
        use_goruut_fallback: bool = False,
        use_spacy: bool = False,
        spacy_model: str = "de_core_news_sm",
        use_lexicon: bool = True,
        strip_stress: bool = True,
        load_silver: bool = True,
        load_gold: bool = True,
        version: str = "1.0",
        expand_abbreviations: bool = True,
        enable_context_detection: bool = True,
        **kwargs: Any,
    ) -> None:
        """Initialize the German G2P converter.

        Args:
            language: Language code (default: 'de-de').
            use_espeak_fallback: Whether to use espeak for OOV words.
            use_goruut_fallback: Whether to use goruut for OOV words.
            use_spacy: Whether to use spaCy for tokenization and POS tagging.
                Defaults to False to preserve legacy behavior and avoid
                requiring spaCy model downloads unless explicitly requested.
            spacy_model: spaCy German model package to load when
                use_spacy=True (e.g., "de_core_news_sm", "de_core_news_md",
                "de_core_news_lg").
            use_lexicon: Whether to use dictionary lookup (default: True).
            strip_stress: Whether to remove stress markers from lexicon
                output.
            load_silver: If True, load silver tier dictionary if available.
                Currently German only has gold dictionary, so this parameter
                is reserved for future use and consistency with English.
                Defaults to True for consistency.
            load_gold: If True, load gold tier dictionary. Defaults to True
                for maximum quality and coverage. Set to False when
                ultra-fast initialization is needed.
            version: Target model version string; returned unchanged by
                ``get_target_model``.
            expand_abbreviations: Whether to expand abbreviations
                (Prof. → Professor).
            enable_context_detection: Context-aware abbreviation expansion.
            **kwargs: Extra keyword arguments; accepted but not used by this
                class.

        Raises:
            ValueError: If both use_espeak_fallback and use_goruut_fallback
                are True.
        """
        # Validate mutual exclusion
        if use_espeak_fallback and use_goruut_fallback:
            raise ValueError(
                "Cannot use both espeak and goruut fallback simultaneously. "
                "Please set only one of use_espeak_fallback or "
                "use_goruut_fallback to True."
            )

        super().__init__(
            language=language,
            use_espeak_fallback=use_espeak_fallback,
            use_goruut_fallback=use_goruut_fallback,
        )

        self.version = version
        self._lexicon: GermanLexicon | None = None
        self._fallback: Any = None
        self._strip_stress = strip_stress
        self.use_spacy = use_spacy
        self.spacy_model = spacy_model

        # spaCy pipeline and tokenizers are created lazily by the properties
        # below, on first access.
        self._nlp: object | None = None
        self._regex_tokenizer: RegexTokenizer | None = None
        self._spacy_tokenizer: SpacyTokenizer | None = None

        # Initialize normalizer
        from kokorog2p.de.normalizer import GermanNormalizer

        self._normalizer = GermanNormalizer(
            track_changes=False,
            expand_abbreviations=expand_abbreviations,
            enable_context_detection=enable_context_detection,
        )

        if use_lexicon:
            try:
                from kokorog2p.de.lexicon import GermanLexicon

                self._lexicon = GermanLexicon(
                    strip_stress=strip_stress,
                    load_silver=load_silver,
                    load_gold=load_gold,
                )
            except ImportError:
                # Best effort: without the lexicon module, self._lexicon
                # stays None and lookups are skipped.
                pass

        # The fallback backend is constructed right away (not lazily).
        # The ValueError above guarantees at most one flag is set.
        if use_goruut_fallback:
            try:
                from kokorog2p.de.fallback import GermanGoruutFallback

                self._fallback = GermanGoruutFallback()
            except ImportError:
                # Best effort: self._fallback stays None.
                pass
        elif use_espeak_fallback:
            try:
                from kokorog2p.de.fallback import GermanEspeakFallback

                self._fallback = GermanEspeakFallback()
            except ImportError:
                # Best effort: self._fallback stays None.
                pass

    @property
    def nlp(self) -> object:
        """Lazily initialize spaCy.

        On first access, imports spaCy, downloads ``self.spacy_model`` if it
        is not an installed package, and loads it with only the "tok2vec"
        and "tagger" components enabled. The loaded pipeline is cached.
        """
        if self._nlp is None:
            import spacy

            name = self.spacy_model
            if not spacy.util.is_package(name):
                spacy.cli.download(name)  # type: ignore[attr-defined]
            self._nlp = spacy.load(name, enable=["tok2vec", "tagger"])
        return self._nlp

    @property
    def regex_tokenizer(self) -> RegexTokenizer:
        """Lazily initialize the regex tokenizer."""
        if self._regex_tokenizer is None:
            self._regex_tokenizer = RegexTokenizer(
                track_positions=True,
                use_bracket_matching=True,
                lang=self.language,
            )
        return self._regex_tokenizer

    @property
    def spacy_tokenizer(self) -> SpacyTokenizer:
        """Lazily initialize the spaCy tokenizer (this also loads spaCy)."""
        if self._spacy_tokenizer is None:
            self._spacy_tokenizer = SpacyTokenizer(
                nlp=self.nlp,
                track_positions=True,
                use_bracket_matching=True,
                lang=self.language,
            )
        return self._spacy_tokenizer

    def __call__(self, text: str) -> list[GToken]:
        """Convert text to a list of tokens with phonemes.

        Each token receives a "rating" describing where its phonemes came
        from: 5 = lexicon, 4 = punctuation, 3 = espeak/goruut fallback,
        2 = spelling rules, 0 = nothing produced (phonemes set to "?").

        Args:
            text: Input text to convert.

        Returns:
            List of GToken objects with phonemes assigned. Empty or
            whitespace-only input yields an empty list.
        """
        if not text or not text.strip():
            return []

        # Normalize text (expand abbreviations, normalize quotes, etc.)
        text = self._normalizer(text)

        tokens = (
            self._tokenize_spacy(text)
            if self.use_spacy
            else self._tokenize_simple(text)
        )

        for token in tokens:
            word = token.text

            # Handle punctuation: tokens without any alphanumeric character
            if not any(c.isalnum() for c in word):
                token.phonemes = self._get_punct_phonemes(word)
                token.set("rating", 4)
                continue

            # Try lexicon first
            phonemes = None
            if self._lexicon:
                phonemes = self._lexicon.lookup(word, token.tag)
                if phonemes:
                    token.phonemes = normalize_to_kokoro(phonemes)
                    token.set("rating", 5)  # Dictionary lookup = highest rating

            # Fallback to espeak or goruut; the backend's first return
            # element is used as-is (no normalize_to_kokoro here).
            if not phonemes and self._fallback:
                fallback_result = self._fallback(word)
                phonemes = fallback_result[0]
                if phonemes:
                    token.phonemes = phonemes
                    token.set("rating", 3)  # Fallback

            # Fallback to rules
            if not phonemes:
                phonemes = self._word_to_phonemes(word)
                if phonemes:
                    token.phonemes = normalize_to_kokoro(phonemes)
                    token.set("rating", 2)  # Rule-based

            if not phonemes:
                token.phonemes = "?"
                token.set("rating", 0)

        ensure_gtoken_positions(tokens, text)
        return tokens

    def _tokenize_spacy(self, text: str) -> list[GToken]:
        """Tokenize text using spaCy."""
        processing_tokens = self.spacy_tokenizer.tokenize(text)
        return [ptoken.to_gtoken() for ptoken in processing_tokens]

    def _tokenize_simple(self, text: str) -> list[GToken]:
        """Tokenize text using regex tokenizer."""
        processing_tokens = self.regex_tokenizer.tokenize(text)
        return [ptoken.to_gtoken() for ptoken in processing_tokens]

    @staticmethod
    def _ch_phoneme(text: str, pos: int) -> str:
        """Choose [ç] (ich-Laut) or [x] (ach-Laut) for 'ch' at ``pos``.

        Args:
            text: The full lower-cased word.
            pos: Index of the 'c' of the digraph.

        Returns:
            "ç" word-initially and after a front-vowel context, else "x".
        """
        if pos == 0:
            # Word-initial ch (loan words) -> [ç] or [k]; [ç] is used.
            return "ç"
        # Look at the preceding letter and at the preceding letter pair.  The
        # pair is needed for "eu"/"äu", which end in "u" but take the
        # ich-Laut ("euch", "feucht"), unlike "au"/"u" ("auch", "Buch").
        if _is_front_vowel_context(text[pos - 1]) or _is_front_vowel_context(
            text[max(pos - 2, 0) : pos]
        ):
            return "ç"
        # After back vowels a, o, u, au -> ach-Laut [x]
        return "x"

    def _word_to_phonemes(self, word: str) -> str:
        """Convert a single word to phonemes using German rules.

        The word is lower-cased and scanned left to right. At each position
        the longest multi-letter grapheme (4, then 3, then 2 letters) is
        tried first, then single-letter rules. Non-alphabetic characters are
        skipped. Final devoicing is applied to the result.

        Args:
            word: Word to convert.

        Returns:
            Phoneme string in IPA.
        """
        text = word.lower()
        result: list[str] = []
        i = 0
        n = len(text)

        while i < n:
            matched = False

            # Try to match multi-character sequences first (longest match)
            for length in (4, 3, 2):
                if i + length > n:
                    continue
                chunk = text[i : i + length]

                # 'ch': ich-Laut vs ach-Laut
                if chunk == "ch":
                    result.append(self._ch_phoneme(text, i))
                    i += 2
                    matched = True
                    break

                # 'sp' / 'st': [ʃp] / [ʃt] only word-initially
                if chunk in ("sp", "st"):
                    if i == 0:
                        result.append("ʃ")
                        result.append(chunk[1])
                        i += 2
                        matched = True
                        break
                    # Not word-initial: skip the IPA table so the letters are
                    # handled one by one by the single-character rules.
                    continue

                # Word-final '-ig' -> [ɪç]
                if chunk == "ig" and i + 2 == n:
                    result.append("ɪ")
                    result.append("ç")
                    i += 2
                    matched = True
                    break

                # Any other multigraph listed in the IPA table
                if chunk in IPA:
                    result.append(IPA[chunk])
                    i += length
                    matched = True
                    break

            if matched:
                continue

            # Single character
            char = text[i]

            if char in self._VOWEL_LETTERS:
                # Vowel quality/length depends on what follows.
                result.append(self._get_vowel_phoneme(text, i))
            elif char == "s":
                # Word-final or before p/t/k -> [s]; otherwise -> [z]
                if i == n - 1 or text[i + 1] in "ptk":
                    result.append("s")
                else:
                    result.append("z")
            elif char == "v":
                # In some loan words 'v' is [v], but the default is [f]
                result.append("f")
            elif char == "r":
                # Vocalized r at end of syllable/word often becomes [ɐ];
                # for simplicity [ʁ] is used everywhere.
                result.append("ʁ")
            elif char in IPA:
                result.append(IPA[char])
            elif char.isalpha():
                # Unknown letter: keep as-is
                result.append(char)
            # Non-alphabetic characters are skipped.
            i += 1

        # Apply final devoicing
        result = self._apply_final_devoicing(result)

        return "".join(result)

    def _get_vowel_phoneme(self, text: str, pos: int) -> str:
        """Determine the correct vowel phoneme based on context.

        German vowel length is determined by syllable structure:
        - Long in open syllables (single consonant + vowel follows)
        - Long before single consonant at word end in many cases
        - Short before consonant clusters
        - Schwa (ə) in unstressed endings like -e, -en, -el, -er

        Args:
            text: The full word text.
            pos: Position of the vowel in the word.

        Returns:
            IPA phoneme for the vowel.
        """
        char = text[pos]
        short = self._SHORT_VOWELS.get(char, char)
        long_ = self._LONG_VOWELS.get(char, short)
        vowels = self._VOWEL_LETTERS
        consonants = self._CONSONANT_LETTERS

        # Everything after the vowel decides its reading.
        remaining = text[pos + 1 :]

        # Special handling for 'e' - check for schwa
        if char == "e":
            # Word-final -e -> schwa
            if not remaining:
                return "ə"
            # -en, -el, -er, ... at word end -> schwa
            if remaining in self._SCHWA_ENDINGS:
                return "ə"
            # -end, -ent at word end, but not when 'e' is the first letter
            if remaining in ("nd", "nt") and pos > 0:
                return "ə"

        # Before 'h' + vowel or word-final 'h' -> long
        if remaining.startswith("h") and (
            len(remaining) == 1 or remaining[1] in vowels
        ):
            return long_

        # Word-final vowel -> usually short except in some words
        if not remaining:
            return short

        # Before single consonant at word end -> often long
        if len(remaining) == 1 and remaining in consonants:
            return long_

        # Before 'ch', 'ß' -> depends on word, often long
        if remaining.startswith(("ch", "ß")):
            return long_

        # Before single consonant + vowel (open syllable) -> long.
        # The digraph "ck" (which marks a short vowel) can never match here
        # because its second letter is a consonant, so a lone "c" or "k"
        # is deliberately NOT excluded ("Haken", "Ekel" have long vowels).
        if (
            len(remaining) >= 2
            and remaining[0] in consonants
            and remaining[1] in vowels
        ):
            return long_

        # Before a consonant cluster, and in every other case -> short
        return short

    def _apply_final_devoicing(self, phonemes: list[str]) -> list[str]:
        """Apply German final obstruent devoicing (Auslautverhärtung).

        Only applies to the final consonant cluster of the word, i.e. to
        every phoneme after the last vowel. If the list contains no vowel,
        all phonemes are considered.

        Args:
            phonemes: List of phoneme strings. Modified in place.

        Returns:
            The same list with final devoicing applied.
        """
        if not phonemes:
            return phonemes

        # Find the last vowel; -1 means "no vowel", so the scan below starts
        # at index 0.
        last_vowel_idx = -1
        for idx in range(len(phonemes) - 1, -1, -1):
            if phonemes[idx] in self._VOWEL_PHONEMES:
                last_vowel_idx = idx
                break

        # Devoice all voiced obstruents after the last vowel
        for idx in range(last_vowel_idx + 1, len(phonemes)):
            phone = phonemes[idx]
            phonemes[idx] = VOICED_TO_UNVOICED.get(phone, phone)

        return phonemes

    @staticmethod
    def _get_punct_phonemes(text: str) -> str:
        """Get phonemes for punctuation tokens.

        Only includes punctuation that exists in the Kokoro vocabulary;
        every other character of ``text`` is dropped.
        """
        puncts = GermanG2P._PUNCTUATION
        return "".join(c for c in text if c in puncts)

    def lookup(self, word: str, tag: str | None = None) -> str | None:
        """Look up a word in the dictionary.

        Args:
            word: The word to look up.
            tag: Optional POS tag (not used for German).

        Returns:
            Phoneme string if found, None otherwise (also None when no
            lexicon is loaded).
        """
        if self._lexicon:
            return self._lexicon.lookup(word)
        return None

    def phonemize(self, text: str) -> str:
        """Convert text to a phoneme string.

        Args:
            text: Input text to convert.

        Returns:
            Phoneme string: the non-empty token phonemes joined by single
            spaces.
        """
        tokens = self(text)
        return " ".join(t.phonemes for t in tokens if t.phonemes)

    def get_target_model(self) -> str:
        """Get the target Kokoro model variant for this G2P instance.

        Returns:
            Model identifier: version string ("1.1" or "1.0").
        """
        return self.version