# Source code for kokorog2p.zh.g2p

"""Chinese G2P (Grapheme-to-Phoneme) converter.

This module provides Chinese text to phoneme conversion using pypinyin
for pinyin extraction and custom IPA mapping for phoneme generation.

Based on misaki's Chinese implementation.

Copyright 2025 kokorog2p contributors
Licensed under the Apache License, Version 2.0
"""

import re

from kokorog2p.base import G2PBase
from kokorog2p.token import GToken
from kokorog2p.tokenization import ensure_gtoken_positions


class ChineseG2P(G2PBase):
    """Chinese G2P using pypinyin and IPA transcription.

    This class converts Chinese text to IPA phonemes using:

    1. Jieba for word segmentation
    2. pypinyin for pinyin extraction
    3. Custom pinyin-to-IPA mapping

    Version ``"1.1"`` routes non-Latin text through ``ZHFrontend``; every
    other version uses the legacy jieba + pypinyin path.

    Example:
        >>> g2p = ChineseG2P()
        >>> tokens = g2p("你好世界")
    """

    # A single character in the CJK Unified Ideographs range U+4E00-U+9FFF.
    _HAN_CHAR = re.compile(r"[\u4E00-\u9FFF]")
    # Splits text into alternating runs inside / outside that range.
    _HAN_RUNS = re.compile(r"[\u4E00-\u9FFF]+|[^\u4E00-\u9FFF]+")
    # Group 1: a run of Latin letters (spaces, apostrophes and hyphens allowed
    # around them); group 2: everything else.
    _EN_ZH_RUNS = re.compile(
        r"([A-Za-z \'-]*[A-Za-z][A-Za-z \'-]*)|([^A-Za-z]+)"
    )
    # Fullwidth punctuation -> ASCII. Written as escapes so the table survives
    # any Unicode normalisation applied to this file.
    _FULLWIDTH_TO_ASCII = str.maketrans(
        {
            "\uFF0C": ",",  # fullwidth comma
            "\uFF0E": ".",  # fullwidth full stop
            "\uFF01": "!",  # fullwidth exclamation mark
            "\uFF1A": ":",  # fullwidth colon
            "\uFF1B": ";",  # fullwidth semicolon
            "\uFF1F": "?",  # fullwidth question mark
            "\uFF08": "(",  # fullwidth left parenthesis
            "\uFF09": ")",  # fullwidth right parenthesis
        }
    )

    def __init__(
        self,
        language: str = "zh",
        use_espeak_fallback: bool = True,
        use_spacy: bool = False,
        spacy_model: str = "zh_core_web_sm",
        version: str = "1.1",
        unk: str = "",
        en_callable=None,
        load_silver: bool = True,
        load_gold: bool = True,
        **kwargs,
    ) -> None:
        """Initialize the Chinese G2P.

        Args:
            language: Language code (e.g., 'zh', 'zh-cn').
            use_espeak_fallback: Whether to use espeak for English words.
            use_spacy: Reserved for API consistency. Chinese uses
                jieba/pypinyin and custom frontend pipelines for
                tokenization and phonemization.
            spacy_model: Reserved for API consistency when use_spacy is
                enabled.
            version: Version of the G2P ("1.0" for base model, "1.1" for
                ZHFrontend multilingual). Default: "1.1".
            unk: Unknown token placeholder.
            en_callable: Callable for English word phonemization.
            load_silver: If True, load silver tier dictionary if available.
                Currently Chinese uses pypinyin system, so this parameter is
                reserved for future use and consistency.
            load_gold: If True, load gold tier dictionary if available.
                Currently Chinese uses pypinyin system, so this parameter is
                reserved for future use and consistency.
            **kwargs: Additional arguments (accepted and ignored here).
        """
        super().__init__(language=language, use_espeak_fallback=use_espeak_fallback)
        self.version = version
        self.use_spacy = use_spacy
        self.spacy_model = spacy_model
        self.unk = unk
        self.en_callable = en_callable
        self.load_silver = load_silver
        self.load_gold = load_gold

        # Heavy / optional dependencies are imported on first use by the
        # properties below.
        self._frontend = None
        self._jieba = None
        self._cn2an = None
        self._pypinyin = None
        self._transcription = None

    @property
    def frontend(self):
        """Lazily built ``ZHFrontend``; ``None`` unless version is "1.1"."""
        if self._frontend is None and self.version == "1.1":
            from kokorog2p.zh.frontend import ZHFrontend

            self._frontend = ZHFrontend(unk=self.unk)
        return self._frontend

    @property
    def jieba(self):
        """Lazy import of jieba."""
        if self._jieba is None:
            import jieba

            self._jieba = jieba
        return self._jieba

    @property
    def cn2an(self):
        """Lazy import of cn2an."""
        if self._cn2an is None:
            import cn2an

            self._cn2an = cn2an
        return self._cn2an

    @property
    def pypinyin(self):
        """Lazy import of pypinyin, exposed as a dict.

        Keys are ``"lazy_pinyin"`` and ``"Style"``.
        """
        if self._pypinyin is None:
            from pypinyin import Style, lazy_pinyin

            self._pypinyin = {"lazy_pinyin": lazy_pinyin, "Style": Style}
        return self._pypinyin

    @property
    def transcription(self):
        """Lazy import of the ``pinyin_to_ipa`` transcription function."""
        if self._transcription is None:
            from kokorog2p.zh.transcription import pinyin_to_ipa

            self._transcription = pinyin_to_ipa
        return self._transcription

    @staticmethod
    def retone(p: str) -> str:
        """Convert tone markers to simpler format.

        Args:
            p: IPA string containing Chao tone letters.

        Returns:
            The string with each tone contour replaced by a single arrow.
        """
        # Longer contours must be replaced before the bare first-tone mark,
        # which is a substring of two of them.
        p = p.replace("˧˩˧", "↓")  # third tone
        p = p.replace("˧˥", "↗")  # second tone
        p = p.replace("˥˩", "↘")  # fourth tone
        p = p.replace("˥", "→")  # first tone
        # U+027B / U+0279 followed by U+0329 both collapse to "ɨ".
        p = p.replace(chr(635) + chr(809), "ɨ").replace(chr(633) + chr(809), "ɨ")
        return p

    def py2ipa(self, py: str) -> str:
        """Convert one pinyin syllable to IPA.

        Only the first candidate returned by the transcription function is
        used; each of its elements is passed through :meth:`retone`.
        """
        return "".join(self.retone(p) for p in self.transcription(py)[0])

    def word2ipa(self, w: str) -> str:
        """Convert a Chinese word to IPA via pinyin."""
        lazy_pinyin = self.pypinyin["lazy_pinyin"]
        Style = self.pypinyin["Style"]
        pinyins = lazy_pinyin(w, style=Style.TONE3, neutral_tone_with_five=True)
        return "".join(self.py2ipa(py) for py in pinyins)

    @staticmethod
    def map_punctuation(text: str) -> str:
        """Convert Chinese punctuation to ASCII equivalents.

        Fullwidth punctuation is first folded to ASCII, sentence punctuation
        is then followed by a space, and CJK / guillemet quote brackets
        become ASCII double quotes.

        Args:
            text: Input text.

        Returns:
            The text with punctuation mapped, stripped of surrounding
            whitespace.
        """
        # Fold fullwidth forms so they receive the same treatment as ASCII.
        text = text.translate(ChineseG2P._FULLWIDTH_TO_ASCII)
        # Pad ASCII punctuation BEFORE substituting the ideographic comma and
        # full stop, so the ", " / ". " produced for those is not padded a
        # second time.
        text = text.replace(",", ", ").replace("、", ", ")
        text = text.replace(".", ". ").replace("。", ". ")
        text = text.replace("!", "! ")
        text = text.replace(":", ": ")
        text = text.replace(";", "; ")
        text = text.replace("?", "? ")
        text = text.replace("«", ' "').replace("»", '" ')
        text = text.replace("《", ' "').replace("》", '" ')
        text = text.replace("「", ' "').replace("」", '" ')
        text = text.replace("【", ' "').replace("】", '" ')
        text = text.replace("(", " (").replace(")", ") ")
        return text.strip()

    def legacy_call(self, text: str) -> str:
        """Legacy phonemization using jieba and pypinyin directly.

        Runs of characters in U+4E00-U+9FFF are segmented with jieba and
        converted word by word (words joined by a space); every other run is
        copied through unchanged.

        Args:
            text: Input text.

        Returns:
            Phoneme string with U+032F removed.
        """
        pieces = []
        for segment in self._HAN_RUNS.findall(text):
            # Each run is homogeneous, so its first character classifies it.
            if self._HAN_CHAR.match(segment):
                words = self.jieba.lcut(segment, cut_all=False)
                segment = " ".join(self.word2ipa(w) for w in words)
            pieces.append(segment)
        return "".join(pieces).replace(chr(815), "")

    def __call__(self, text: str) -> list[GToken]:
        """Convert text to tokens with phonemes.

        Args:
            text: Input text to convert.

        Returns:
            List of GToken objects with phonemes. Empty for empty or
            whitespace-only input; otherwise a single token covering the
            whole text.
        """
        if not text or not text.strip():
            return []

        phonemes, _ = self._phonemize_internal(text)

        # Create a single token for now (Chinese segmentation is complex).
        token = GToken(
            text=text,
            tag="X",
            whitespace="",
            phonemes=phonemes if phonemes else None,
        )
        token.rating = "zh" if phonemes else None

        tokens = [token]
        ensure_gtoken_positions(tokens, text)
        return tokens

    def _phonemize_internal(
        self, text: str, en_callable=None
    ) -> tuple[str, list | None]:
        """Internal phonemization logic.

        Args:
            text: Input text.
            en_callable: Optional callable for English words; falls back to
                the instance's ``en_callable`` when None.

        Returns:
            Tuple of (phoneme_string, token_list). The token list is always
            None in this implementation.
        """
        if not text.strip():
            return "", None

        # Convert Arabic numerals to Chinese
        text = self.cn2an.transform(text, "an2cn")
        # Map punctuation
        text = self.map_punctuation(text)

        # ZHFrontend only exists for version 1.1 (see ``frontend``); every
        # other version takes the legacy path instead of calling None.
        if self.version != "1.1":
            return self.legacy_call(text), None

        en_callable = self.en_callable if en_callable is None else en_callable
        segments = []
        for en, zh in self._EN_ZH_RUNS.findall(text):
            en, zh = en.strip(), zh.strip()
            if zh:
                result, _ = self.frontend(zh)
                segments.append(result)
            elif en_callable is None:
                # No English phonemizer available: emit the placeholder.
                segments.append(self.unk if self.unk else "")
            else:
                segments.append(en_callable(en))
        return " ".join(segments), None

    def lookup(self, word: str, tag: str | None = None) -> str | None:
        """Look up a word's phonemes.

        Args:
            word: The word to look up.
            tag: Optional POS tag (ignored for Chinese).

        Returns:
            Phoneme string, or None for an empty word.
        """
        if not word:
            return None
        return self.word2ipa(word)

    def phonemize(self, text: str) -> str:
        """Convert text to phonemes.

        Args:
            text: Input text to convert.

        Returns:
            Phoneme string.
        """
        result, _ = self._phonemize_internal(text)
        return result

    def __repr__(self) -> str:
        return f"ChineseG2P(language={self.language!r}, version={self.version!r})"

    def get_target_model(self) -> str:
        """Get the target Kokoro model variant for this G2P instance.

        Returns:
            Model identifier: "1.1" for version 1.1, "1.0" otherwise.
        """
        return "1.1" if self.version == "1.1" else "1.0"