Source code for kokorog2p.zh.g2p

"""Chinese G2P (Grapheme-to-Phoneme) converter.

This module provides Chinese text to phoneme conversion using pypinyin
for pinyin extraction and custom IPA mapping for phoneme generation.

Based on misaki's Chinese implementation.

Copyright 2025 kokorog2p contributors
Licensed under the Apache License, Version 2.0
"""

import re

from kokorog2p.base import G2PBase
from kokorog2p.token import GToken
from kokorog2p.tokenization import ensure_gtoken_positions



[docs]
class ChineseG2P(G2PBase):
    """Chinese G2P using pypinyin and IPA transcription.

    Input text is first normalised: Arabic numerals are rewritten through
    ``cn2an`` and Chinese punctuation is mapped to ASCII. What happens next
    depends on ``version``:

    * ``"1.0"`` (legacy): runs of CJK characters are segmented with jieba,
      converted to pinyin with pypinyin and mapped to IPA with the custom
      pinyin-to-IPA transcription.
    * otherwise (default ``"1.1"``): non-Latin runs are handed to
      ``ZHFrontend`` and Latin-letter runs to the optional ``en_callable``.

    jieba, cn2an, pypinyin, the transcription table and the frontend are all
    imported lazily on first use. Calling the instance returns a single
    ``GToken`` that covers the whole input text.

    Example:
        >>> g2p = ChineseG2P()
        >>> tokens = g2p("你好世界")
    """


[docs]
    def __init__(
        self,
        language: str = "zh",
        use_espeak_fallback: bool = True,
        use_spacy: bool = False,
        spacy_model: str = "zh_core_web_sm",
        version: str = "1.1",
        unk: str = "",
        en_callable=None,
        load_silver: bool = True,
        load_gold: bool = True,
        **kwargs,
    ) -> None:
        """Set up the converter; no heavy dependency is imported here.

        Args:
            language: Language code (e.g., 'zh', 'zh-cn'); forwarded to
                the ``G2PBase`` initializer.
            use_espeak_fallback: Espeak fallback switch; forwarded to the
                ``G2PBase`` initializer.
            use_spacy: Stored only. Reserved for API consistency with other
                languages; Chinese relies on jieba/pypinyin and the custom
                frontend instead.
            spacy_model: Stored only. Reserved for API consistency.
            version: ``"1.0"`` selects the legacy jieba/pypinyin pipeline,
                ``"1.1"`` (default) the ZHFrontend pipeline.
            unk: Placeholder emitted for English runs when no English
                callable is available.
            en_callable: Default callable used to phonemize English runs.
            load_silver: Stored only. Reserved for future dictionary tiers;
                defaults to True for consistency.
            load_gold: Stored only. Reserved for future dictionary tiers;
                defaults to True for consistency.
            **kwargs: Accepted and ignored.
        """
        super().__init__(language=language, use_espeak_fallback=use_espeak_fallback)

        # Pipeline selection.
        self.version = version

        # Options kept only so the constructor matches other languages.
        self.use_spacy = use_spacy
        self.spacy_model = spacy_model

        # Handling of English / unknown material.
        self.unk = unk
        self.en_callable = en_callable

        # Dictionary-tier flags (currently informational).
        self.load_silver = load_silver
        self.load_gold = load_gold

        # Caches filled by the lazy properties on first access.
        self._frontend = self._jieba = self._cn2an = None
        self._pypinyin = self._transcription = None


    @property
    def frontend(self):
        """Lazy initialization of ZHFrontend for version 1.1."""
        if self._frontend is None and self.version == "1.1":
            from kokorog2p.zh.frontend import ZHFrontend

            self._frontend = ZHFrontend(unk=self.unk)
        return self._frontend

    @property
    def jieba(self):
        """Lazy import of jieba."""
        if self._jieba is None:
            import jieba

            self._jieba = jieba
        return self._jieba

    @property
    def cn2an(self):
        """Lazy import of cn2an."""
        if self._cn2an is None:
            import cn2an

            self._cn2an = cn2an
        return self._cn2an

    @property
    def pypinyin(self):
        """Lazy import of pypinyin."""
        if self._pypinyin is None:
            from pypinyin import Style, lazy_pinyin

            self._pypinyin = {"lazy_pinyin": lazy_pinyin, "Style": Style}
        return self._pypinyin

    @property
    def transcription(self):
        """Lazy import of transcription module."""
        if self._transcription is None:
            from kokorog2p.zh.transcription import pinyin_to_ipa

            self._transcription = pinyin_to_ipa
        return self._transcription


[docs]
    @staticmethod
    def retone(p: str) -> str:
        """Convert tone markers to simpler format."""
        p = p.replace("˧˩˧", "↓")  # third tone
        p = p.replace("˧˥", "↗")  # second tone
        p = p.replace("˥˩", "↘")  # fourth tone
        p = p.replace("˥", "→")  # first tone
        p = p.replace(chr(635) + chr(809), "ɨ").replace(chr(633) + chr(809), "ɨ")
        return p



[docs]
    def py2ipa(self, py: str) -> str:
        """Convert pinyin to IPA."""
        return "".join(self.retone(p) for p in self.transcription(py)[0])



[docs]
    def word2ipa(self, w: str) -> str:
        """Convert a Chinese word to IPA via pinyin."""
        lazy_pinyin = self.pypinyin["lazy_pinyin"]
        Style = self.pypinyin["Style"]
        pinyins = lazy_pinyin(w, style=Style.TONE3, neutral_tone_with_five=True)
        return "".join(self.py2ipa(py) for py in pinyins)



[docs]
    @staticmethod
    def map_punctuation(text: str) -> str:
        """Convert Chinese punctuation to ASCII equivalents."""
        text = text.replace("、", ", ").replace("，", ", ")
        text = text.replace("。", ". ").replace("．", ". ")
        text = text.replace("！", "! ")
        text = text.replace("：", ": ")
        text = text.replace("；", "; ")
        text = text.replace("？", "? ")
        text = text.replace("«", ' "').replace("»", '" ')
        text = text.replace("《", ' "').replace("》", '" ')
        text = text.replace("「", ' "').replace("」", '" ')
        text = text.replace("【", ' "').replace("】", '" ')
        text = text.replace("（", " (").replace("）", ") ")
        return text.strip()



[docs]
    def legacy_call(self, text: str) -> str:
        """Legacy phonemization using jieba and pypinyin directly."""
        is_zh = bool(re.match(r"[\u4E00-\u9FFF]", text[0])) if text else False
        result = ""
        for segment in re.findall(r"[\u4E00-\u9FFF]+|[^\u4E00-\u9FFF]+", text):
            if is_zh:
                words = self.jieba.lcut(segment, cut_all=False)
                segment = " ".join(self.word2ipa(w) for w in words)
            result += segment
            is_zh = not is_zh
        return result.replace(chr(815), "")



[docs]
    def __call__(self, text: str) -> list[GToken]:
        """Convert text to tokens with phonemes.

        Args:
            text: Input text to convert.

        Returns:
            List of GToken objects with phonemes.
        """
        if not text or not text.strip():
            return []

        # Phonemize using the internal method
        phonemes, _ = self._phonemize_internal(text)

        # Create a single token for now (Chinese segmentation is complex)
        # The frontend returns detailed tokens if needed
        token = GToken(
            text=text,
            tag="X",
            whitespace="",
            phonemes=phonemes if phonemes else None,
        )
        token.rating = "zh" if phonemes else None
        tokens = [token]
        ensure_gtoken_positions(tokens, text)
        return tokens


    def _phonemize_internal(
        self, text: str, en_callable=None
    ) -> tuple[str, list | None]:
        """Internal phonemization logic.

        Args:
            text: Input text.
            en_callable: Optional callable for English words.

        Returns:
            Tuple of (phoneme_string, token_list).
        """
        if not text.strip():
            return "", None

        # Convert Arabic numerals to Chinese
        text = self.cn2an.transform(text, "an2cn")

        # Map punctuation
        text = self.map_punctuation(text)

        if self.version == "1.0":
            return self.legacy_call(text), None

        # Use ZHFrontend for version 1.1
        en_callable = self.en_callable if en_callable is None else en_callable
        segments = []
        for en, zh in re.findall(
            r"([A-Za-z \'-]*[A-Za-z][A-Za-z \'-]*)|([^A-Za-z]+)", text
        ):
            en, zh = en.strip(), zh.strip()
            if zh:
                result, _ = self.frontend(zh)
                segments.append(result)
            elif en_callable is None:
                segments.append(self.unk if self.unk else "")
            else:
                segments.append(en_callable(en))

        return " ".join(segments), None


[docs]
    def lookup(self, word: str, tag: str | None = None) -> str | None:
        """Look up a word's phonemes.

        Args:
            word: The word to look up.
            tag: Optional POS tag (ignored for Chinese).

        Returns:
            Phoneme string or None.
        """
        if not word:
            return None
        return self.word2ipa(word)



[docs]
    def phonemize(self, text: str) -> str:
        """Convert text to phonemes.

        Args:
            text: Input text to convert.

        Returns:
            Phoneme string.
        """
        result, _ = self._phonemize_internal(text)
        return result


    def __repr__(self) -> str:
        return f"ChineseG2P(language={self.language!r}, version={self.version!r})"


[docs]
    def get_target_model(self) -> str:
        """Get the target Kokoro model variant for this G2P instance.

        Returns:
            Model identifier: "1.1" for version 1.1, "1.0" otherwise.
        """
        return self.version