Source code for kokorog2p.ko.g2p

"""Korean G2P (Grapheme-to-Phoneme) converter.

This module provides Korean text to phoneme conversion using MeCab for morphological
analysis and custom phonological rules based on Korean Standard Pronunciation.

Based on g2pK by kyubyong: https://github.com/kyubyong/g2pK

Copyright 2025 kokorog2p contributors
Licensed under the Apache License, Version 2.0
"""

from kokorog2p.base import G2PBase
from kokorog2p.token import GToken
from kokorog2p.tokenization import ensure_gtoken_positions

from .jamo_to_ipa import jamo_to_ipa


class KoreanG2P(G2PBase):
    """Korean G2P using MeCab and Korean phonological rules.

    This class converts Korean text to phonemes using:
    1. Idiom/abbreviation replacement
    2. English to Hangul conversion
    3. MeCab POS tagging
    4. Number spelling
    5. Hangul decomposition
    6. Phonological rules application
    7. Jamo composition

    Example:
        >>> g2p = KoreanG2P()
        >>> tokens = g2p("안녕하세요")
    """

    def __init__(
        self,
        language: str = "ko",
        use_espeak_fallback: bool = False,
        use_goruut_fallback: bool = False,
        use_spacy: bool = False,
        spacy_model: str = "ko_core_news_sm",
        load_silver: bool = True,
        load_gold: bool = True,
        use_dict: bool = True,
        group_vowels: bool = False,
        to_syl: bool = False,
        version: str = "1.0",
        **kwargs,
    ) -> None:
        """Initialize the Korean G2P.

        Args:
            language: Language code (e.g., 'ko', 'ko-kr').
            use_espeak_fallback: Whether to use espeak for unknown words.
                Not typically used for Korean. Defaults to False.
            use_goruut_fallback: Whether to use goruut for unknown words.
                Not typically used for Korean. Defaults to False.
            use_spacy: Reserved for API consistency. Korean uses g2pK backend
                for tokenization and phonemization.
            spacy_model: Reserved for API consistency when use_spacy is enabled.
            load_silver: Reserved for API consistency. Korean doesn't use
                dictionary tiers. Defaults to True.
            load_gold: Reserved for API consistency. Korean doesn't use
                dictionary tiers. Defaults to True.
            use_dict: Whether to use MeCab dictionary for POS tagging.
                Defaults to True. If False, skips MeCab annotation.
            group_vowels: If True, merge similar vowels (e.g., ㅐ->ㅔ).
                Defaults to False.
            to_syl: If True, compose jamo back to syllables.
                Defaults to False (returns decomposed jamo).
            version: Version string stored on the instance and returned
                unchanged by ``get_target_model``. Defaults to "1.0".
            **kwargs: Additional arguments. Accepted but not used or
                forwarded by this constructor.
        """
        super().__init__(
            language=language,
            use_espeak_fallback=use_espeak_fallback,
            use_goruut_fallback=use_goruut_fallback,
        )
        self.version = version
        self.use_spacy = use_spacy
        self.spacy_model = spacy_model
        self.load_silver = load_silver
        self.load_gold = load_gold
        self.use_dict = use_dict
        self.group_vowels = group_vowels
        self.to_syl = to_syl
        # The backend is created on first access of the ``g2pk`` property.
        self._g2pk_instance = None

    @property
    def g2pk(self):
        """Lazy initialization of g2pK backend."""
        if self._g2pk_instance is None:
            # Imported here so the backend module is only loaded on first use.
            from .g2pk import G2p

            self._g2pk_instance = G2p()
        return self._g2pk_instance

    def _korean_to_ipa(self, text: str) -> str | None:
        """Run the g2pK backend on ``text`` and map the result to IPA.

        Shared by ``__call__``, ``lookup`` and ``_phonemize_internal`` so the
        backend options are passed identically everywhere.

        Args:
            text: Input text handed to the backend as-is.

        Returns:
            The IPA phoneme string, or None when either the backend or
            ``jamo_to_ipa`` produces an empty/falsy result.
        """
        # g2pK returns Hangul in phonetic form.
        hangul_phonemes = self.g2pk(
            text,
            descriptive=False,
            verbose=False,
            group_vowels=self.group_vowels,
            to_syl=self.to_syl,
            use_dict=self.use_dict,
        )
        if not hangul_phonemes:
            return None
        # Collapse any falsy conversion result to None for a single sentinel.
        return jamo_to_ipa(hangul_phonemes) or None

    @staticmethod
    def _make_token(text: str, ipa_phonemes: str | None) -> GToken:
        """Build the single "KO" token that carries the whole input text.

        Args:
            text: Original input text stored on the token.
            ipa_phonemes: IPA string, or None when nothing was produced.

        Returns:
            A GToken whose rating is "ko" when phonemes exist, else None.
        """
        token = GToken(
            text=text,
            tag="KO",
            whitespace="",
            phonemes=ipa_phonemes,
        )
        token.rating = "ko" if ipa_phonemes else None
        return token

    def __call__(self, text: str) -> list[GToken]:
        """Convert Korean text to tokens with phonemes.

        Args:
            text: Input Korean text to convert.

        Returns:
            List of GToken objects with phonemes. Empty for empty or
            whitespace-only input; otherwise a list holding one token that
            spans the whole input.
        """
        if not text or not text.strip():
            return []

        tokens = [self._make_token(text, self._korean_to_ipa(text))]
        ensure_gtoken_positions(tokens, text)
        return tokens

    def lookup(self, word: str, tag: str | None = None) -> str | None:
        """Look up a Korean word and return its phonetic representation.

        Args:
            word: The word to look up.
            tag: Optional POS tag (not used in Korean G2P).

        Returns:
            Phoneme string or None if empty.
        """
        if not word or not word.strip():
            return None
        return self._korean_to_ipa(word)

    def _phonemize_internal(self, text: str) -> tuple[str, list[GToken] | None]:
        """Internal phonemization logic.

        Args:
            text: Input text.

        Returns:
            Tuple of (phoneme_string, token_list). When no phonemes are
            produced the result is ``("", None)``.
        """
        ipa_phonemes = self._korean_to_ipa(text)
        if not ipa_phonemes:
            # Keep the first element a str, as the return annotation promises.
            return "", None
        return ipa_phonemes, [self._make_token(text, ipa_phonemes)]

    def get_target_model(self) -> str:
        """Get the target Kokoro model variant for this G2P instance.

        Returns:
            Model identifier: version string ("1.1" or "1.0").
        """
        return self.version