Source code for kokorog2p.pt.g2p

"""Brazilian Portuguese G2P (Grapheme-to-Phoneme) converter.

A rule-based Grapheme-to-Phoneme engine for Brazilian Portuguese,
designed for Kokoro TTS.

Brazilian Portuguese Phonology Features:
- 7 oral vowels (a, e, ɛ, i, o, ɔ, u) with open/closed e/o variants
- 5 nasal vowels (ã, ẽ, ĩ, õ, ũ)
- Nasal diphthongs (ãw̃, õj̃, etc.)
- Palatalization: lh [ʎ], nh [ɲ], x/ch [ʃ]
- Affrication: t+i [ʧ], d+i [ʤ] (Brazilian Portuguese feature)
- Sibilants: s [s/z], x [ʃ], z [z]
- Liquids: r [ʁ/x/h] (varies by dialect), rr [ʁ/x], single r [ɾ]
- No θ sound (unlike European Portuguese)

Reference:
https://en.wikipedia.org/wiki/Portuguese_phonology
https://en.wikipedia.org/wiki/Brazilian_Portuguese
"""

import re
import unicodedata
from typing import Any, Final

from kokorog2p.base import G2PBase
from kokorog2p.pipeline.tokenizer import SpacyTokenizer
from kokorog2p.pt.normalizer import PortugueseNormalizer
from kokorog2p.token import GToken
from kokorog2p.tokenization import ensure_gtoken_positions

# =============================================================================
# Brazilian Portuguese Grapheme-to-Phoneme Mappings
# =============================================================================

# The seven oral vowel phonemes that can occur in stressed position.
ORAL_VOWELS: Final[frozenset[str]] = frozenset({"a", "e", "i", "o", "u", "ɛ", "ɔ"})

# Vowel letters that have a nasalized counterpart.
NASAL_VOWELS: Final[str] = "aeiou"

# Consonant letters whose phoneme symbol is the letter itself.
SIMPLE_CONSONANTS: Final[dict[str, str]] = {letter: letter for letter in "bfkpv"}



[docs]
class PortugueseG2P(G2PBase):
    """Brazilian Portuguese G2P converter using rule-based phonemization.

    This class provides grapheme-to-phoneme conversion for Brazilian Portuguese text
    using Portuguese orthographic rules.

    Example:
        >>> g2p = PortugueseG2P()
        >>> tokens = g2p("Olá, como está?")
        >>> for token in tokens:
        ...     print(f"{token.text} -> {token.phonemes}")
    """

    # Typographic punctuation -> ASCII equivalent. Applied by _preprocess as a
    # sequence of plain str.replace calls, one per entry.
    _PUNCT_MAP: dict[str, str] = {
        chr(171): '"',  # U+00AB « left-pointing double angle quotation mark
        chr(187): '"',  # U+00BB » right-pointing double angle quotation mark
        chr(8216): "'",  # U+2018 ‘ left single quotation mark
        chr(8217): "'",  # U+2019 ’ right single quotation mark
        chr(8220): '"',  # U+201C “ left double quotation mark
        chr(8221): '"',  # U+201D ” right double quotation mark
        chr(8212): "-",  # U+2014 — em dash
        chr(8211): "-",  # U+2013 – en dash
        chr(8230): "...",  # U+2026 … horizontal ellipsis
    }

    # Small lexicon for exceptional words. Keys are lowercase spellings; values
    # are phoneme strings returned verbatim by _word_to_phonemes (with the
    # stress mark ˈ stripped when mark_stress is False).
    _LEXICON: dict[str, str] = {
        # Common words
        "e": "i",  # Conjunction "and"
        "é": "ɛˈ",  # "is" (stressed open e; stress mark follows the vowel)
        # Add more as needed
    }


[docs]
    def __init__(
        self,
        language: str = "pt-br",
        use_espeak_fallback: bool = False,
        use_spacy: bool = False,
        spacy_model: str = "pt_core_news_sm",
        mark_stress: bool = True,
        affricate_ti_di: bool = True,  # Affricate t/d before i (Brazilian feature)
        expand_abbreviations: bool = True,
        enable_context_detection: bool = True,
        dialect: str = "br",  # "br" for Brazilian, "pt" for European
        version: str = "1.0",
        **kwargs: Any,
    ) -> None:
        """Set up the converter's options and its text normalizer.

        Args:
            language: Language code forwarded to the base class
                (default: 'pt-br').
            use_espeak_fallback: Forwarded to the base class; reserved for
                future espeak integration.
            use_spacy: When True, ``__call__`` tokenizes with spaCy instead of
                the built-in regex tokenizer. Defaults to False.
            spacy_model: Name of the spaCy Portuguese model package loaded
                lazily when spaCy is used (e.g., "pt_core_news_sm",
                "pt_core_news_md", "pt_core_news_lg").
            mark_stress: Whether to emit the primary stress mark ˈ.
            affricate_ti_di: Whether to affricate /t d/ before /i/
                (Brazilian feature).
            expand_abbreviations: Passed to the normalizer; whether common
                abbreviations are expanded.
            enable_context_detection: Passed to the normalizer; enables
                context-aware abbreviation expansion.
            dialect: "br" for Brazilian, "pt" for European Portuguese. Stored
                on the instance and passed to the normalizer
                (affects number pronunciation: dezesseis vs dezasseis).
            version: Target model version, stored as ``self.version``.
            **kwargs: Accepted for call-site compatibility; not used here.
        """
        super().__init__(language=language, use_espeak_fallback=use_espeak_fallback)

        # Phonemization options.
        self.mark_stress = mark_stress
        self.affricate_ti_di = affricate_ti_di
        self.dialect = dialect
        self.version = version

        # spaCy settings; the pipeline and tokenizer are created on first use.
        self.use_spacy = use_spacy
        self.spacy_model = spacy_model
        self._nlp: object | None = None
        self._spacy_tokenizer: SpacyTokenizer | None = None

        # Text normalizer, configured for the selected dialect.
        normalizer = PortugueseNormalizer(
            expand_abbreviations=expand_abbreviations,
            enable_context_detection=enable_context_detection,
            dialect=dialect,
        )
        self._normalizer = normalizer



[docs]
    def __call__(self, text: str) -> list[GToken]:
        """Convert text to a list of tokens with phonemes.

        Args:
            text: Input text to convert.

        Returns:
            List of GToken objects with phonemes assigned.
        """
        if not text.strip():
            return []

        # Preprocess
        text = self._preprocess(text)

        # Tokenize
        tokens = self._tokenize_spacy(text) if self.use_spacy else self._tokenize(text)

        # Process tokens
        for token in tokens:
            # Skip tokens that already have phonemes (punctuation)
            if token.phonemes is not None:
                continue

            # Convert word to phonemes
            if token.is_word:
                phonemes = self._word_to_phonemes(token.text)
                if phonemes:
                    token.phonemes = phonemes
                    token.set("rating", 3)  # Rule-based rating

        # Handle remaining unknown words
        for token in tokens:
            if token.phonemes is None and token.is_word:
                token.phonemes = "?"

        ensure_gtoken_positions(tokens, text)
        return tokens


    @property
    def nlp(self) -> object:
        """Return the spaCy pipeline, loading it on first access.

        If ``spacy.util.is_package`` does not find ``self.spacy_model``, the
        model is downloaded first. The pipeline is loaded with only the
        "tok2vec" and "tagger" components enabled and cached on the instance.
        """
        if self._nlp is not None:
            return self._nlp

        import spacy

        model_name = self.spacy_model
        if not spacy.util.is_package(model_name):
            spacy.cli.download(model_name)  # type: ignore[attr-defined]

        self._nlp = spacy.load(model_name, enable=["tok2vec", "tagger"])
        return self._nlp

    @property
    def spacy_tokenizer(self) -> SpacyTokenizer:
        """Return the spaCy-backed tokenizer, building it on first access.

        The tokenizer wraps ``self.nlp`` and is cached on the instance.
        """
        if self._spacy_tokenizer is not None:
            return self._spacy_tokenizer

        tokenizer = SpacyTokenizer(
            nlp=self.nlp,
            track_positions=True,
            use_bracket_matching=True,
            lang=self.language,
        )
        self._spacy_tokenizer = tokenizer
        return self._spacy_tokenizer

    def _tokenize_spacy(self, text: str) -> list[GToken]:
        """Tokenize ``text`` with the spaCy tokenizer.

        Each processing token is converted with ``to_gtoken()``. A token whose
        text is non-empty and contains no alphanumeric character is treated as
        punctuation: its phonemes are set to its own text with rating 4.

        Args:
            text: Preprocessed text.

        Returns:
            List of GToken objects in tokenizer order.
        """
        gtokens: list[GToken] = []

        for item in self.spacy_tokenizer.tokenize(text):
            gtoken = item.to_gtoken()

            raw = item.text
            if raw and all(not ch.isalnum() for ch in raw):
                # Pure punctuation/symbols pass through unchanged.
                gtoken.phonemes = item.text
                gtoken.set("rating", 4)

            gtokens.append(gtoken)

        return gtokens

    def _preprocess(self, text: str) -> str:
        """Preprocess text before G2P conversion.

        Args:
            text: Raw input text.

        Returns:
            Preprocessed text.
        """
        # Normalize Unicode
        text = unicodedata.normalize("NFC", text)

        # Apply normalizer (abbreviations, temperature, etc.)
        text = self._normalizer(text)

        # Normalize punctuation (keep for legacy compatibility)
        for old, new in self._PUNCT_MAP.items():
            text = text.replace(old, new)

        # Remove non-breaking spaces
        text = text.replace("\u00a0", " ")
        text = text.replace("\u202f", " ")

        # Collapse multiple spaces
        text = re.sub(r" +", " ", text)

        return text.strip()

    def _tokenize(self, text: str) -> list[GToken]:
        """Tokenize text into words and punctuation.

        Args:
            text: Preprocessed text.

        Returns:
            List of GToken objects.
        """
        # Pattern to split on whitespace and capture punctuation
        pattern = r"([^\w'-]+|[\w'-]+)"
        parts = re.findall(pattern, text)

        tokens = []
        for part in parts:
            if not part or part.isspace():
                continue

            # Check if it's a word or punctuation
            if re.match(r"[\w'-]+", part):
                # It's a word
                token = GToken(text=part)
                token.set("is_word", True)
                tokens.append(token)
            else:
                # It's punctuation
                token = GToken(text=part)
                token.set("is_word", False)
                token.phonemes = part  # Punctuation passes through
                tokens.append(token)

        return tokens

    def _normalize_text(self, text: str) -> tuple[str, set[int], set[int]]:
        """Normalize accented characters and track stress positions.

        Args:
            text: Input text with possible accents.

        Returns:
            Tuple of (normalized_text, stressed_vowel_positions,
            open_vowel_positions).
        """
        stressed_vowels = set()
        open_vowels = set()  # Track é/ó (open) vs ê/ô (closed)
        normalized_text: list[str] = []

        for _i, char in enumerate(text):
            if char in "áéíóúâêôãõ":
                # Remember position
                pos = len(normalized_text)
                stressed_vowels.add(pos)
                # Track open vowels (acute accent)
                if char in "éó":
                    open_vowels.add(pos)
                # Normalize
                if char == "á":
                    normalized_text.append("a")
                elif char in ("é", "ê"):
                    normalized_text.append("e")
                elif char == "í":
                    normalized_text.append("i")
                elif char in ("ó", "ô"):
                    normalized_text.append("o")
                elif char == "ú":
                    normalized_text.append("u")
                elif char in ("ã", "õ"):
                    # Keep tilde for later
                    normalized_text.append(char)
            else:
                normalized_text.append(char)

        return "".join(normalized_text), stressed_vowels, open_vowels

    def _process_vowel(
        self,
        text: str,
        i: int,
        n: int,
        stressed_vowels: set[int],
        open_vowels: set[int],
    ) -> tuple[list[str], int]:
        """Process a vowel and possible diphthong.

        Args:
            text: Normalized text.
            i: Current position.
            n: Text length.
            stressed_vowels: Set of stressed vowel positions.
            open_vowels: Set of open vowel positions.

        Returns:
            Tuple of (phonemes, new_position).
        """
        vowel = text[i]
        result = []

        if vowel == "e":
            # Use open ɛ only if stressed AND has acute accent (é)
            if i in stressed_vowels and i in open_vowels:
                result.append("ɛ")
            else:
                result.append("e")
            # Check for eu diphthong -> ew (meu, seu)
            if i + 1 < n and text[i + 1] == "u":
                result.append("w")
                i += 1

        elif vowel == "o":
            # Use open ɔ only if stressed AND has acute accent (ó)
            if i in stressed_vowels and i in open_vowels:
                result.append("ɔ")
            else:
                result.append("o")
            # Check for ou diphthong -> ow (vou, sou)
            if i + 1 < n and text[i + 1] == "u":
                result.append("w")
                i += 1

        elif vowel == "u":
            result.append("u")
            # Check for ui diphthong -> uj (muito)
            if i + 1 < n and text[i + 1] == "i":
                result.append("j")
                i += 1

        elif vowel == "a":
            result.append("a")
            # Check for au diphthong -> aw (Tchau, mau)
            if i + 1 < n and text[i + 1] == "u":
                result.append("w")
                i += 1

        elif vowel == "i":
            result.append("i")

        # Add stress marker if applicable
        if self.mark_stress and i in stressed_vowels:
            result.append("ˈ")

        return result, i + 1

    def _process_t_consonant(
        self, text: str, i: int, n: int, stressed_vowels: set[int]
    ) -> tuple[list[str], int, bool]:
        """Process 't' consonant with possible affrication.

        Returns:
            Tuple of (phonemes, new_position, matched).
        """
        result = []
        matched = False

        if self.affricate_ti_di:
            # Final "te" -> ʧi
            if (
                i + 1 < n
                and text[i + 1] == "e"
                and (i + 1) not in stressed_vowels
                and i + 2 >= n
            ):
                result.extend(["ʧ", "i"])
                return result, i + 2, True
            # t + i (unstressed) -> ʧ
            if i + 1 < n and text[i + 1] == "i" and (i + 1) not in stressed_vowels:
                result.append("ʧ")
                return result, i + 1, True

        result.append("t")
        return result, i + 1, matched

    def _process_d_consonant(
        self, text: str, i: int, n: int, stressed_vowels: set[int]
    ) -> tuple[list[str], int, bool]:
        """Process 'd' consonant with possible affrication.

        Returns:
            Tuple of (phonemes, new_position, matched).
        """
        result = []
        matched = False

        if self.affricate_ti_di:
            # d + i (unstressed) -> ʤ
            if i + 1 < n and text[i + 1] == "i" and (i + 1) not in stressed_vowels:
                result.append("ʤ")
                return result, i + 1, True

        result.append("d")
        return result, i + 1, matched

    def _process_nasal_vowel(
        self, text: str, i: int, n: int, stressed_vowels: set[int]
    ) -> tuple[list[str], int, bool]:
        """Process nasal vowel combination.

        Returns:
            Tuple of (phonemes, new_position, matched).
        """
        if not (
            i + 1 < n
            and text[i] in NASAL_VOWELS
            and text[i + 1] in "mn"
            and (i + 2 >= n or text[i + 2] not in "aeiouãõh")
        ):
            return [], i, False

        result = []
        vowel = text[i]

        # Nasalize vowel
        nasal_map = {"a": "ã", "e": "ẽ", "i": "ĩ", "o": "õ", "u": "ũ"}
        if vowel in nasal_map:
            result.append(nasal_map[vowel])

        # Add stress if needed
        if self.mark_stress and i in stressed_vowels:
            result.append("ˈ")

        # Add nasal consonant
        result.append(text[i + 1])

        return result, i + 2, True

    def _process_multi_char_sequences(
        self, text: str, i: int, n: int
    ) -> tuple[list[str], int, bool]:
        """Process multi-character grapheme sequences.

        Returns:
            Tuple of (phonemes, new_position, matched).
        """
        result = []

        # tch -> ʧ (Tchau, tchau)
        if i + 2 < n and text[i : i + 3] == "tch":
            result.append("ʧ")
            return result, i + 3, True

        # nh -> ɲ (ninho)
        if i + 1 < n and text[i : i + 2] == "nh":
            result.append("ɲ")
            return result, i + 2, True

        # lh -> ʎ (filho)
        if i + 1 < n and text[i : i + 2] == "lh":
            result.append("ʎ")
            return result, i + 2, True

        # ch -> ʃ (chá)
        if i + 1 < n and text[i : i + 2] == "ch":
            result.append("ʃ")
            return result, i + 2, True

        # rr -> r or ʁ (strong r: carro)
        if i + 1 < n and text[i : i + 2] == "rr":
            result.append("r")  # Use r for strong trill
            return result, i + 2, True

        # ss -> s (isso -> iso)
        if i + 1 < n and text[i : i + 2] == "ss":
            result.append("s")
            return result, i + 2, True

        # qu + vowel -> kw or k
        if i + 2 < n and text[i : i + 2] == "qu":
            if text[i + 2] in "ei":
                result.append("k")
            else:
                result.append("k")
                result.append("w")
            return result, i + 2, True

        # gu + vowel -> ɡw or ɡ
        if i + 2 < n and text[i : i + 2] == "gu":
            if text[i + 2] in "ei":
                result.append("ɡ")
            else:
                result.append("ɡ")
                result.append("w")
            return result, i + 2, True

        return [], i, False

    def _process_simple_consonants(
        self, text: str, i: int, n: int, stressed_vowels: set[int]
    ) -> tuple[list[str], int, bool]:
        """Convert a single consonant letter using its context rules.

        Exactly one letter is consumed on a match. Letters with no rule here
        (including vowels) report no match and leave the position unchanged.

        Args:
            text: Normalized text.
            i: Current position.
            n: Text length.
            stressed_vowels: Set of stressed vowel positions (not consulted by
                any rule in this method).

        Returns:
            Tuple of (phonemes, new_position, matched).
        """
        char = text[i]
        after = i + 1
        has_next = after < n
        vowel_letters = "aeiouãõ"

        # Letters that map to themselves (b, f, k, p, v)
        if char in SIMPLE_CONSONANTS:
            return [SIMPLE_CONSONANTS[char]], after, True

        # Letters whose phoneme does not depend on context.
        context_free = {
            "ç": "s",
            "j": "ʒ",
            "x": "ʃ",
            "m": "m",
            "n": "n",
            "w": "w",
            "y": "j",
        }

        if char == "c":
            # Soft before e/i, hard otherwise.
            phone = "s" if has_next and text[after] in "ei" else "k"
        elif char == "g":
            # Soft before e/i, hard otherwise.
            phone = "ʒ" if has_next and text[after] in "ei" else "ɡ"
        elif char == "z":
            # Word-final z is voiceless.
            phone = "z" if has_next else "s"
        elif char == "s":
            # Voiced only between two vowel letters.
            between_vowels = (
                i > 0
                and has_next
                and text[i - 1] in vowel_letters
                and text[after] in vowel_letters
            )
            phone = "z" if between_vowels else "s"
        elif char == "r":
            # Strong r word-initially, tap elsewhere.
            phone = "r" if i == 0 else "ɾ"
        elif char == "l":
            # Vocalized to w unless a vowel letter follows.
            phone = "l" if has_next and text[after] in vowel_letters else "w"
        elif char in context_free:
            phone = context_free[char]
        else:
            return [], i, False

        return [phone], after, True

    def _word_to_phonemes(self, word: str) -> str:
        """Convert a single word to phonemes.

        Args:
            word: Word to convert.

        Returns:
            Phoneme string in IPA.
        """
        if not word:
            return ""

        # Check lexicon first
        word_lower = word.lower()
        if word_lower in self._LEXICON:
            base_phonemes = self._LEXICON[word_lower]
            if not self.mark_stress:
                base_phonemes = base_phonemes.replace("ˈ", "")
            return base_phonemes

        # Convert to lowercase for processing
        text = word.lower()

        # Normalize and track stress
        text, stressed_vowels, open_vowels = self._normalize_text(text)

        result: list[str] = []
        i = 0
        n = len(text)

        while i < n:
            matched = False

            # Multi-character sequences first
            phonemes, new_i, was_matched = self._process_multi_char_sequences(
                text, i, n
            )
            if was_matched:
                result.extend(phonemes)
                i = new_i
                matched = True

            # Try nasal combinations if not yet matched
            if not matched:
                phonemes, new_i, was_matched = self._process_nasal_vowel(
                    text, i, n, stressed_vowels
                )
                if was_matched:
                    result.extend(phonemes)
                    i = new_i
                    matched = True

            # Already-nasalized vowels
            if not matched and text[i] in "ãõ":
                result.append(text[i])
                if self.mark_stress and i in stressed_vowels:
                    result.append("ˈ")
                i += 1
                matched = True

            # t/d consonants with affrication
            if not matched and text[i] == "t":
                phonemes, new_i, was_matched = self._process_t_consonant(
                    text, i, n, stressed_vowels
                )
                result.extend(phonemes)
                i = new_i
                matched = True

            if not matched and text[i] == "d":
                phonemes, new_i, was_matched = self._process_d_consonant(
                    text, i, n, stressed_vowels
                )
                result.extend(phonemes)
                i = new_i
                matched = True

            # Other consonants
            if not matched:
                phonemes, new_i, was_matched = self._process_simple_consonants(
                    text, i, n, stressed_vowels
                )
                if was_matched:
                    result.extend(phonemes)
                    i = new_i
                    matched = True

            # Vowels (with possible diphthongs)
            if not matched and text[i] in "aeiou":
                phonemes, new_i = self._process_vowel(
                    text, i, n, stressed_vowels, open_vowels
                )
                result.extend(phonemes)
                i = new_i
                matched = True

            # Unknown character - skip
            if not matched:
                i += 1

        return "".join(result)


[docs]
    def lookup(self, word: str, tag: str | None = None) -> str | None:
        """Look up a word's phonemes.

        Thin wrapper that delegates to ``_word_to_phonemes``, so the result
        comes from the small built-in lexicon or, failing that, the rules.

        Args:
            word: The word to look up.
            tag: Optional POS tag (ignored for Portuguese).

        Returns:
            Whatever ``_word_to_phonemes`` returns for ``word``. That helper
            returns an empty string (not None) for an empty word.
        """
        return self._word_to_phonemes(word)



[docs]
    def phonemize(self, text: str) -> str:
        """Convert text to phonemes.

        Args:
            text: Input text to convert.

        Returns:
            Phoneme string.
        """
        tokens = self(text)
        result = []
        for token in tokens:
            if token.phonemes:
                result.append(token.phonemes)
        return " ".join(result)



[docs]
    def get_target_model(self) -> str:
        """Get the target Kokoro model variant for this G2P instance.

        Returns:
            The ``version`` string stored on the instance by ``__init__``
            (default "1.0"). The value is returned as-is, without validation.
        """
        return self.version