Source code for kokorog2p.token

"""Token dataclass for G2P processing."""

from dataclasses import dataclass, field, replace
from typing import Any


[docs] @dataclass class GToken: """ A token representing a word or text unit with optional phoneme information. Attributes: text: The original text of the token. tag: Part-of-speech tag (e.g., 'NN', 'VB', 'JJ'). whitespace: Trailing whitespace after this token. phonemes: The phonemic transcription of the token. start_ts: Start timestamp for audio alignment. end_ts: End timestamp for audio alignment. rating: Quality rating of the phoneme transcription. _: Extension dictionary for custom attributes. """ text: str tag: str = "" whitespace: str = " " phonemes: str | None = None start_ts: float | None = None end_ts: float | None = None rating: str | None = None _: dict[str, Any] = field(default_factory=dict) @property def has_phonemes(self) -> bool: """Check if this token has phonemes assigned.""" return self.phonemes is not None and len(self.phonemes) > 0 @property def is_punctuation(self) -> bool: """Check if this token is punctuation.""" return self.tag in ( ".", ",", ":", ";", "!", "?", "-", "'", '"', "(", ")", "PUNCT", ) @property def is_word(self) -> bool: """Check if this token is a word (not punctuation or whitespace).""" return bool(self.text.strip()) and not self.is_punctuation
[docs] def get(self, key: str, default: Any = None) -> Any: """Get a custom attribute from the extension dict.""" return self._.get(key, default)
[docs] def set(self, key: str, value: Any) -> None: """Set a custom attribute in the extension dict.""" self._[key] = value
[docs] def copy(self) -> "GToken": """Create a shallow copy of this token.""" return GToken( text=self.text, tag=self.tag, whitespace=self.whitespace, phonemes=self.phonemes, start_ts=self.start_ts, end_ts=self.end_ts, rating=self.rating, _=dict(self._), )
[docs] def __repr__(self) -> str: """Return a string representation of the token.""" if self.phonemes: return ( f"GToken({self.text!r}, tag={self.tag!r}, phonemes={self.phonemes!r})" ) return f"GToken({self.text!r}, tag={self.tag!r})"