Source code for kokorog2p.cs.g2p

"""Czech G2P (Grapheme-to-Phoneme) converter.

Grapheme to Phoneme for Czech language.
Originally developed by Richard Mazur:
https://github.com/essare-rimaz/grapheme_to_phoneme_CZ/blob/main/server.r

Later converted to Python by Miroslav Suchy <msuchy@redhat.com> with the
assistance of AI and, with Richard's permission, released under the
Apache-2.0 license.

Adapted for kokorog2p architecture.

Czech Phonology reference:
https://cs.wikipedia.org/wiki/Fonologie_%C4%8De%C5%A1tiny
"""

import re
from typing import Any, Final

from kokorog2p.base import G2PBase
from kokorog2p.cs.normalizer import CzechNormalizer
from kokorog2p.token import GToken
from kokorog2p.tokenization import ensure_gtoken_positions

# =============================================================================
# Czech Phoneme Mappings
# =============================================================================

# IPA mappings for Czech graphemes
IPA: Final[dict[str, str]] = {
    "a": "a",
    "á": "aː",
    "b": "b",
    "c": "t͡s",
    "č": "t͡ʃ",
    "d": "d",
    "ď": "ɟ",
    "e": "ɛ",
    "é": "ɛː",
    "ě": "ě",
    "f": "f",
    "g": "ɡ",
    "h": "ɦ",
    "ch": "x",
    "i": "ɪ",
    "í": "iː",
    "j": "j",
    "k": "k",
    "l": "l",
    "m": "m",
    "n": "n",
    "ň": "ň",
    "o": "o",
    "ó": "oː",
    "p": "p",
    "q": "k",
    "r": "r",
    "s": "s",
    "š": "ʃ",
    "t": "t",
    "ť": "c",
    "u": "u",
    "ú": "uː",
    "ů": "uː",
    "v": "v",
    "w": "w",
    "x": "ks",
    "y": "ɪ",
    "ý": "iː",
    "z": "z",
    "ž": "ʒ",
    "di": "ɟɪ",
    "dí": "ɟiː",
    "dě": "ɟɛ",
    "ti": "cɪ",
    "tí": "ciː",
    "tě": "cɛ",
    "ni": "ɲɪ",
    "ní": "ɲiː",
    "ně": "ɲɛ",
    "mě": "mɲɛ",
    "bě": "bjɛ",
    "pě": "pjɛ",
    "vě": "vjɛ",
    "ts": "t͡s",
    "dz": "d͡z",
    "ie": "ɪjɛ",
    "ia": "ɪja",
    "io": "ɪjo",
    "ř": "r̝",
}

# Temporary representation for processing
TEMP: Final[dict[str, str]] = {
    # Identity table: every working symbol maps to itself.  Looking a
    # character up here therefore answers "is this a known grapheme?";
    # anything absent comes back as None from ``TEMP.get``.
    grapheme: grapheme
    for grapheme in (
        # single letters (plus the "ch" digraph in its alphabet position)
        "a", "á", "b", "c", "č", "d", "ď", "e", "é", "ě",
        "f", "g", "h", "ch", "i", "í", "j", "k", "l", "m",
        "n", "ň", "o", "ó", "p", "q", "r", "ř", "s", "š",
        "t", "ť", "u", "ú", "ů", "v", "w", "x", "y", "ý",
        "z", "ž",
        # d/t/n + i/í/ě clusters
        "di", "dí", "dě", "ti", "tí", "tě", "ni", "ní", "ně",
        # m/b/p/v + ě clusters
        "mě", "bě", "pě", "vě",
        # affricate digraphs
        "dz", "ts",
        # i + e/a/o vowel sequences
        "ie", "ia", "io",
        # word separator
        " ",
    )
}

# Consonant voicing pairs
# Each tuple lists the two members of one voicing pair, in the order the
# forward half of PAIRED_CONSONANTS enumerates them.
_VOICING_PAIRS: Final[tuple[tuple[str, str], ...]] = (
    ("b", "p"),
    ("d", "t"),
    ("ď", "ť"),
    ("g", "k"),
    ("v", "f"),
    ("z", "s"),
    ("ž", "š"),
    ("ch", "h"),
    ("dz", "c"),
    ("dž", "č"),
)

# Bidirectional lookup: a consonant maps to its partner with opposite voicing.
PAIRED_CONSONANTS: Final[dict[str, str]] = {
    **{left: right for left, right in _VOICING_PAIRS},
    **{right: left for left, right in _VOICING_PAIRS},
}

# Identity tables used purely for membership tests.
PAIRED_UNVOICED: Final[dict[str, str]] = {
    consonant: consonant
    for consonant in ("p", "t", "ť", "k", "f", "s", "š", "ch", "c", "č")
}

PAIRED_VOICED: Final[dict[str, str]] = {
    consonant: consonant
    for consonant in ("b", "d", "ď", "g", "v", "z", "ž", "dz", "dž")
}

# Consonant + vowel combinations
# d/t/n fuse with a following í/i/ě into one palatalised symbol.
DTN: Final[dict[str, str]] = {c: c for c in ("d", "t", "n")}
DTN_VOCAL: Final[dict[str, str]] = {v: v for v in ("í", "i", "ě")}

# m/b/p/v fuse with a following ě.
MBPV: Final[dict[str, str]] = {c: c for c in ("m", "b", "p", "v")}
MBPV_VOCAL: Final[dict[str, str]] = {v: v for v in ("ě",)}

# Digraph detection: (first letter, second letter) membership tables.
# c + h -> "ch"
CH_FIRST: Final[dict[str, str]] = dict(c="c")
CH_SECOND: Final[dict[str, str]] = dict(h="h")

# t + s -> "ts"
TS_FIRST: Final[dict[str, str]] = dict(t="t")
TS_SECOND: Final[dict[str, str]] = dict(s="s")

# d + z -> "dz"
DZ_FIRST: Final[dict[str, str]] = dict(d="d")
DZ_SECOND: Final[dict[str, str]] = dict(z="z")

# i + e/a/o -> "ie"/"ia"/"io"
IEIAIO_FIRST: Final[dict[str, str]] = dict(i="i")
IEIAIO_SECOND: Final[dict[str, str]] = {v: v for v in ("e", "a", "o")}


def _indices_where_in(v: list[str | None], keyset: dict[str, str]) -> list[int]:
    """Find indices where values are in keyset."""
    s = set(keyset.keys())
    return [i for i, x in enumerate(v) if x in s]



[docs]
class CzechG2P(G2PBase):
    """Czech G2P converter using rule-based phoneme conversion with fallback options.

    This class provides grapheme-to-phoneme conversion for Czech text
    using phonological rules for voicing assimilation, palatalization,
    and other Czech-specific features, with optional fallback to espeak or goruut.

    Input text is first passed through a ``CzechNormalizer`` and then split
    into word, punctuation and whitespace runs. Words are converted by the
    rule engine; the optional fallback is consulted only when the rules
    produce an empty result for a word. The espeak and goruut fallbacks are
    mutually exclusive.

    Example:
        >>> g2p = CzechG2P()
        >>> tokens = g2p("Dobrý den")
        >>> for token in tokens:
        ...     print(f"{token.text} -> {token.phonemes}")
    """


[docs]
    def __init__(
        self,
        language: str = "cs-cz",
        use_espeak_fallback: bool = False,
        use_goruut_fallback: bool = False,
        unk: str = "?",
        load_silver: bool = True,
        load_gold: bool = True,
        version: str = "1.0",
        expand_abbreviations: bool = True,
        enable_context_detection: bool = True,
        **kwargs: Any,
    ) -> None:
        """Initialize the Czech G2P converter.

        Args:
            language: Language code (default: 'cs-cz').
            use_espeak_fallback: Whether to use espeak for OOV words.
            use_goruut_fallback: Whether to use goruut for OOV words.
            unk: Character to use for unknown characters.
            load_silver: If True, load silver tier dictionary if available.
                Currently Czech uses rule-based G2P, so this parameter
                is reserved for future use and consistency.
                Defaults to True for consistency.
            load_gold: If True, load gold tier dictionary if available.
                Currently Czech uses rule-based G2P, so this parameter
                is reserved for future use and consistency.
                Defaults to True for consistency.
            version: Version string stored on the instance and returned by
                ``get_target_model()``. Defaults to "1.0".
            expand_abbreviations: If True, expand common abbreviations
                (e.g., "Dr." → "Doktor"). Defaults to True.
            enable_context_detection: If True, use context-aware expansion
                for ambiguous abbreviations. Defaults to True.
            **kwargs: Accepted for call-site compatibility; not used by
                this constructor.

        Raises:
            ValueError: If both use_espeak_fallback and use_goruut_fallback are True.
        """
        # Function-scope import keeps this method self-contained; the module
        # itself does not otherwise need logging.
        import logging

        # Validate mutual exclusion
        if use_espeak_fallback and use_goruut_fallback:
            raise ValueError(
                "Cannot use both espeak and goruut fallback simultaneously. "
                "Please set only one of use_espeak_fallback or "
                "use_goruut_fallback to True."
            )

        super().__init__(language=language, use_espeak_fallback=use_espeak_fallback)
        self.version = version
        self.unk = unk
        self.load_silver = load_silver
        self.load_gold = load_gold
        self.use_goruut_fallback = use_goruut_fallback
        self.expand_abbreviations = expand_abbreviations
        self.enable_context_detection = enable_context_detection
        self._fallback: Any = None

        # Initialize normalizer
        self._normalizer = CzechNormalizer(
            expand_abbreviations=expand_abbreviations,
            enable_context_detection=enable_context_detection,
        )

        # The requested fallback is created eagerly.  A missing optional
        # dependency is still tolerated (conversion stays rule-based), but it
        # is now reported instead of being silently swallowed, so that a
        # caller who asked for a fallback can tell it is not active.
        log = logging.getLogger(__name__)
        if use_goruut_fallback:
            try:
                from kokorog2p.cs.fallback import CzechGoruutFallback

                self._fallback = CzechGoruutFallback()
            except ImportError as exc:
                log.warning(
                    "goruut fallback requested but unavailable (%s); "
                    "using rule-based conversion only.",
                    exc,
                )
        elif use_espeak_fallback:
            try:
                from kokorog2p.cs.fallback import CzechEspeakFallback

                self._fallback = CzechEspeakFallback()
            except ImportError as exc:
                log.warning(
                    "espeak fallback requested but unavailable (%s); "
                    "using rule-based conversion only.",
                    exc,
                )



[docs]
    def __call__(self, text: str) -> list[GToken]:
        """Tokenize ``text`` and attach phonemes to every token.

        The text is normalized first, then split into runs of word
        characters, runs of punctuation and runs of whitespace. Whitespace
        runs do not become tokens; each is stored on the token that precedes
        it (whitespace before the first token is discarded).

        Args:
            text: Input text to convert.

        Returns:
            List of GToken objects with phonemes assigned. Empty for empty
            or whitespace-only input.
        """
        if not (text and text.strip()):
            return []

        # Abbreviations, temperature, quotes, etc. are handled by the normalizer.
        text = self._normalizer(text)

        result: list[GToken] = []

        for piece_match in re.finditer(r"(\w+|[^\w\s]+|\s+)", text, re.UNICODE):
            piece = piece_match.group()

            if piece.isspace():
                if result:
                    result[-1].whitespace = piece
                continue

            tok = GToken(text=piece, tag="", whitespace="")

            if any(ch.isalnum() for ch in piece):
                # Word: rule-based conversion first.
                phonemes = self._word_to_phonemes(piece)
                rating = 4  # rule-based

                # The fallback is only consulted when the rules yield nothing
                # (useful for loan words or foreign abbreviations).
                if not phonemes and self._fallback:
                    fallback_phonemes, _ = self._fallback(piece)
                    if fallback_phonemes:
                        phonemes = fallback_phonemes
                        rating = 2  # fallback

                tok.set("rating", rating)
                tok.phonemes = phonemes or self.unk
            else:
                # Pure punctuation run.
                tok.phonemes = self._get_punct_phonemes(piece)
                tok.set("rating", 4)

            result.append(tok)

        ensure_gtoken_positions(result, text)
        return result


    def _word_to_phonemes(self, word: str) -> str:
        """Convert a single word to phonemes using Czech rules.

        Processing steps:

        1. Lower-case the word and map each character to a working symbol;
           characters that are not in ``TEMP`` become ``None`` and are
           dropped from the output.
        2. Fuse digraphs and vowel sequences (``ie``/``ia``/``io``, ``dz``,
           ``ts``, ``ch``) into single symbols. ``ch`` is fused *before*
           voicing is evaluated so that it behaves as one consonant.
        3. Apply word-final devoicing and regressive voicing assimilation
           (see ``_assimilate_voicing``).
        4. Fuse ``d/t/n`` + ``i/í/ě`` and ``m/b/p/v`` + ``ě``.
        5. Translate every remaining symbol through ``IPA``; symbols with
           no IPA entry are emitted unchanged.

        Args:
            word: Word to convert.

        Returns:
            Phoneme string in IPA (empty if no character was recognised).
        """
        symbols: list[str | None] = [TEMP.get(ch) for ch in word.lower()]

        for firsts, seconds in (
            (IEIAIO_FIRST, IEIAIO_SECOND),
            (DZ_FIRST, DZ_SECOND),
            (TS_FIRST, TS_SECOND),
            (CH_FIRST, CH_SECOND),
        ):
            symbols = self._fuse_pairs(symbols, firsts, seconds)

        self._assimilate_voicing(symbols)

        # Palatalising clusters are fused only now: the bare consonant must
        # still be visible during assimilation (e.g. the "d" of "sdělit"
        # voices the preceding "s").
        for firsts, seconds in ((DTN, DTN_VOCAL), (MBPV, MBPV_VOCAL)):
            symbols = self._fuse_pairs(symbols, firsts, seconds)

        return "".join(IPA.get(sym, sym) for sym in symbols if sym is not None)

    @staticmethod
    def _fuse_pairs(
        symbols: list[str | None],
        firsts: dict[str, str],
        seconds: dict[str, str],
    ) -> list[str | None]:
        """Fuse each ``firsts`` symbol with a directly following ``seconds`` symbol.

        Returns a new list in which every fused pair occupies a single slot,
        so later steps see the fused symbol's real neighbour. ``None``
        entries (unrecognised characters) are kept and never fuse.
        """
        fused: list[str | None] = []
        i = 0
        count = len(symbols)
        while i < count:
            current = symbols[i]
            following = symbols[i + 1] if i + 1 < count else None
            if (
                current is not None
                and following is not None
                and current in firsts
                and following in seconds
            ):
                fused.append(current + following)
                i += 2
            else:
                fused.append(current)
                i += 1
        return fused

    @staticmethod
    def _assimilate_voicing(symbols: list[str | None]) -> None:
        """Apply final devoicing and regressive voicing assimilation in place.

        * A voiced paired consonant in last position is devoiced.
        * Scanning right to left, each paired consonant takes the voicing of
          the paired consonant directly after it. Going right to left lets
          the change propagate through a whole cluster
          ("vztah": z -> s, then v -> f).
        * "v" undergoes assimilation but does not voice a preceding
          consonant ("svět" keeps its "s").
        * "h" is devoiced to "ch" (its partner in ``PAIRED_CONSONANTS``) even
          though ``PAIRED_VOICED`` does not list it; it is deliberately not
          treated as a voicing trigger.
        """
        if not symbols:
            return

        def is_voiced(symbol: str | None) -> bool:
            return symbol in PAIRED_VOICED or symbol == "h"

        # Final devoicing comes first so the devoiced consonant can in turn
        # devoice the consonant before it ("hvozd": d -> t, then z -> s).
        last = symbols[-1]
        if last is not None and is_voiced(last):
            symbols[-1] = PAIRED_CONSONANTS.get(last, last)

        for i in range(len(symbols) - 2, -1, -1):
            current = symbols[i]
            following = symbols[i + 1]
            if current is None or following is None:
                continue
            if (
                current in PAIRED_UNVOICED
                and following in PAIRED_VOICED
                and following != "v"
            ):
                symbols[i] = PAIRED_CONSONANTS.get(current, current)
            elif is_voiced(current) and following in PAIRED_UNVOICED:
                symbols[i] = PAIRED_CONSONANTS.get(current, current)

    @staticmethod
    def _get_punct_phonemes(text: str) -> str:
        """Get phonemes for punctuation tokens."""
        puncts = frozenset(";:,.!?-\"'()[]—…")
        return "".join("—" if c == "-" else c for c in text if c in puncts)


[docs]
    def lookup(self, word: str, tag: str | None = None) -> str | None:
        """Look up a word in the dictionary.

        For Czech, this just converts the word to phonemes using rules.

        Args:
            word: The word to look up.
            tag: Optional POS tag (not used for Czech).

        Returns:
            Phoneme string.
        """
        return self._word_to_phonemes(word)



[docs]
    def get_target_model(self) -> str:
        """Get the target Kokoro model variant for this G2P instance.

        Returns:
            Model identifier: version string ("1.1" or "1.0").
        """
        return self.version