"""Japanese G2P (Grapheme-to-Phoneme) converter.
This module provides Japanese text to phoneme conversion using
pyopenjtalk or cutlet for analysis and custom IPA mapping.
Based on misaki's Japanese implementation.
Copyright 2024 kokorog2p contributors
Licensed under the Apache License, Version 2.0
"""
from kokorog2p.base import G2PBase
from kokorog2p.token import GToken
from kokorog2p.tokenization import ensure_gtoken_positions
# Katakana to phoneme mapping
M2P = {
    # Each key is one katakana character; the trailing number is its decimal
    # Unicode code point. Code points 12483 and 12531 are not part of this
    # literal.
    # --- vowels (small form, then full-size form) ---
    "ァ": "a",  # 12449
    "ア": "a",  # 12450
    "ィ": "i",  # 12451
    "イ": "i",  # 12452
    "ゥ": "u",  # 12453
    "ウ": "u",  # 12454
    "ェ": "e",  # 12455
    "エ": "e",  # 12456
    "ォ": "o",  # 12457
    "オ": "o",  # 12458
    # --- k / g row ---
    "カ": "ka",  # 12459
    "ガ": "ga",  # 12460
    "キ": "ki",  # 12461
    "ギ": "gi",  # 12462
    "ク": "ku",  # 12463
    "グ": "gu",  # 12464
    "ケ": "ke",  # 12465
    "ゲ": "ge",  # 12466
    "コ": "ko",  # 12467
    "ゴ": "go",  # 12468
    # --- s / z row ---
    "サ": "sa",  # 12469
    "ザ": "za",  # 12470
    "シ": "ɕi",  # 12471
    "ジ": "ʥi",  # 12472
    "ス": "su",  # 12473
    "ズ": "zu",  # 12474
    "セ": "se",  # 12475
    "ゼ": "ze",  # 12476
    "ソ": "so",  # 12477
    "ゾ": "zo",  # 12478
    # --- t / d row ---
    "タ": "ta",  # 12479
    "ダ": "da",  # 12480
    "チ": "ʨi",  # 12481
    "ヂ": "ʥi",  # 12482
    "ツ": "ʦu",  # 12484
    "ヅ": "zu",  # 12485
    "テ": "te",  # 12486
    "デ": "de",  # 12487
    "ト": "to",  # 12488
    "ド": "do",  # 12489
    # --- n row ---
    "ナ": "na",  # 12490
    "ニ": "ni",  # 12491
    "ヌ": "nu",  # 12492
    "ネ": "ne",  # 12493
    "ノ": "no",  # 12494
    # --- h / b / p row ---
    "ハ": "ha",  # 12495
    "バ": "ba",  # 12496
    "パ": "pa",  # 12497
    "ヒ": "hi",  # 12498
    "ビ": "bi",  # 12499
    "ピ": "pi",  # 12500
    "フ": "fu",  # 12501
    "ブ": "bu",  # 12502
    "プ": "pu",  # 12503
    "ヘ": "he",  # 12504
    "ベ": "be",  # 12505
    "ペ": "pe",  # 12506
    "ホ": "ho",  # 12507
    "ボ": "bo",  # 12508
    "ポ": "po",  # 12509
    # --- m row ---
    "マ": "ma",  # 12510
    "ミ": "mi",  # 12511
    "ム": "mu",  # 12512
    "メ": "me",  # 12513
    "モ": "mo",  # 12514
    # --- y row (small form, then full-size form) ---
    "ャ": "ja",  # 12515
    "ヤ": "ja",  # 12516
    "ュ": "ju",  # 12517
    "ユ": "ju",  # 12518
    "ョ": "jo",  # 12519
    "ヨ": "jo",  # 12520
    # --- r row ---
    "ラ": "ra",  # 12521
    "リ": "ri",  # 12522
    "ル": "ru",  # 12523
    "レ": "re",  # 12524
    "ロ": "ro",  # 12525
    # --- w row ---
    "ヮ": "wa",  # 12526
    "ワ": "wa",  # 12527
    "ヰ": "i",  # 12528
    "ヱ": "e",  # 12529
    "ヲ": "o",  # 12530
    # --- v forms and small ka / ke ---
    "ヴ": "vu",  # 12532
    "ヵ": "ka",  # 12533
    "ヶ": "ke",  # 12534
    "ヷ": "va",  # 12535
    "ヸ": "vi",  # 12536
    "ヹ": "ve",  # 12537
    "ヺ": "vo",  # 12538
}
# Add combination characters
M2P.update(
    {
        # Two-character katakana sequences that are read as a single mora.
        # The trailing numbers are the decimal code points of the two
        # characters that make up the key.
        # --- y / w glides on bare vowels ---
        "イェ": "je",  # 12452 + 12455
        "ウィ": "wi",  # 12454 + 12451
        "ウゥ": "wu",  # 12454 + 12453
        "ウェ": "we",  # 12454 + 12455
        "ウォ": "wo",  # 12454 + 12457
        # --- キ + small kana ---
        "キィ": "ᶄi",  # 12461 + 12451
        "キェ": "ᶄe",  # 12461 + 12455
        "キャ": "ᶄa",  # 12461 + 12515
        "キュ": "ᶄu",  # 12461 + 12517
        "キョ": "ᶄo",  # 12461 + 12519
        # --- ギ + small kana ---
        "ギィ": "ᶃi",  # 12462 + 12451
        "ギェ": "ᶃe",  # 12462 + 12455
        "ギャ": "ᶃa",  # 12462 + 12515
        "ギュ": "ᶃu",  # 12462 + 12517
        "ギョ": "ᶃo",  # 12462 + 12519
        # --- ク + small kana ---
        "クァ": "Ka",  # 12463 + 12449
        "クィ": "Ki",  # 12463 + 12451
        "クゥ": "Ku",  # 12463 + 12453
        "クェ": "Ke",  # 12463 + 12455
        "クォ": "Ko",  # 12463 + 12457
        "クヮ": "Ka",  # 12463 + 12526
        # --- グ + small kana ---
        "グァ": "Ga",  # 12464 + 12449
        "グィ": "Gi",  # 12464 + 12451
        "グゥ": "Gu",  # 12464 + 12453
        "グェ": "Ge",  # 12464 + 12455
        "グォ": "Go",  # 12464 + 12457
        "グヮ": "Ga",  # 12464 + 12526
        # --- シ + small kana ---
        "シェ": "ɕe",  # 12471 + 12455
        "シャ": "ɕa",  # 12471 + 12515
        "シュ": "ɕu",  # 12471 + 12517
        "ショ": "ɕo",  # 12471 + 12519
        # --- ジ + small kana ---
        "ジェ": "ʥe",  # 12472 + 12455
        "ジャ": "ʥa",  # 12472 + 12515
        "ジュ": "ʥu",  # 12472 + 12517
        "ジョ": "ʥo",  # 12472 + 12519
        # --- ス / ズ + small ィ ---
        "スィ": "si",  # 12473 + 12451
        "ズィ": "zi",  # 12474 + 12451
        # --- チ + small kana ---
        "チェ": "ʨe",  # 12481 + 12455
        "チャ": "ʨa",  # 12481 + 12515
        "チュ": "ʨu",  # 12481 + 12517
        "チョ": "ʨo",  # 12481 + 12519
        # --- ヂ + small kana ---
        "ヂェ": "ʥe",  # 12482 + 12455
        "ヂャ": "ʥa",  # 12482 + 12515
        "ヂュ": "ʥu",  # 12482 + 12517
        "ヂョ": "ʥo",  # 12482 + 12519
        # --- ツ + small kana ---
        "ツァ": "ʦa",  # 12484 + 12449
        "ツィ": "ʦi",  # 12484 + 12451
        "ツェ": "ʦe",  # 12484 + 12455
        "ツォ": "ʦo",  # 12484 + 12457
        # --- テ + small kana ---
        "ティ": "ti",  # 12486 + 12451
        "テェ": "ƫe",  # 12486 + 12455
        "テャ": "ƫa",  # 12486 + 12515
        "テュ": "ƫu",  # 12486 + 12517
        "テョ": "ƫo",  # 12486 + 12519
        # --- デ + small kana ---
        "ディ": "di",  # 12487 + 12451
        "デェ": "ᶁe",  # 12487 + 12455
        "デャ": "ᶁa",  # 12487 + 12515
        "デュ": "ᶁu",  # 12487 + 12517
        "デョ": "ᶁo",  # 12487 + 12519
        # --- ト / ド + small ゥ ---
        "トゥ": "tu",  # 12488 + 12453
        "ドゥ": "du",  # 12489 + 12453
        # --- ニ + small kana ---
        "ニィ": "ɲi",  # 12491 + 12451
        "ニェ": "ɲe",  # 12491 + 12455
        "ニャ": "ɲa",  # 12491 + 12515
        "ニュ": "ɲu",  # 12491 + 12517
        "ニョ": "ɲo",  # 12491 + 12519
        # --- ヒ + small kana ---
        "ヒィ": "çi",  # 12498 + 12451
        "ヒェ": "çe",  # 12498 + 12455
        "ヒャ": "ça",  # 12498 + 12515
        "ヒュ": "çu",  # 12498 + 12517
        "ヒョ": "ço",  # 12498 + 12519
        # --- ビ + small kana ---
        "ビィ": "ᶀi",  # 12499 + 12451
        "ビェ": "ᶀe",  # 12499 + 12455
        "ビャ": "ᶀa",  # 12499 + 12515
        "ビュ": "ᶀu",  # 12499 + 12517
        "ビョ": "ᶀo",  # 12499 + 12519
        # --- ピ + small kana ---
        "ピィ": "ᶈi",  # 12500 + 12451
        "ピェ": "ᶈe",  # 12500 + 12455
        "ピャ": "ᶈa",  # 12500 + 12515
        "ピュ": "ᶈu",  # 12500 + 12517
        "ピョ": "ᶈo",  # 12500 + 12519
        # --- フ + small kana ---
        "ファ": "fa",  # 12501 + 12449
        "フィ": "fi",  # 12501 + 12451
        "フェ": "fe",  # 12501 + 12455
        "フォ": "fo",  # 12501 + 12457
        # --- ミ + small kana ---
        "ミィ": "ᶆi",  # 12511 + 12451
        "ミェ": "ᶆe",  # 12511 + 12455
        "ミャ": "ᶆa",  # 12511 + 12515
        "ミュ": "ᶆu",  # 12511 + 12517
        "ミョ": "ᶆo",  # 12511 + 12519
        # --- リ + small kana ---
        "リィ": "ᶉi",  # 12522 + 12451
        "リェ": "ᶉe",  # 12522 + 12455
        "リャ": "ᶉa",  # 12522 + 12515
        "リュ": "ᶉu",  # 12522 + 12517
        "リョ": "ᶉo",  # 12522 + 12519
        # --- ヴ + small kana ---
        "ヴァ": "va",  # 12532 + 12449
        "ヴィ": "vi",  # 12532 + 12451
        "ヴェ": "ve",  # 12532 + 12455
        "ヴォ": "vo",  # 12532 + 12457
        "ヴャ": "ᶀa",  # 12532 + 12515
        "ヴュ": "ᶀu",  # 12532 + 12517
        "ヴョ": "ᶀo",  # 12532 + 12519
    }
)
# Special characters
# Marks that are not plain consonant+vowel kana: ッ maps to "ʔ", ン to "ɴ",
# and the prolonged-sound mark ー to the length symbol "ː".
M2P.update(
    {
        "ッ": "ʔ",
        "ン": "ɴ",
        "ー": "ː",
    }
)
# Punctuation mapping
# Maps punctuation as it appears in Japanese text to the ASCII symbol that is
# emitted as the token surface and phoneme.
PUNCT_MAP = {
    # Quotation marks and brackets all collapse to a straight double quote.
    "«": '"',
    "»": '"',
    "〈": '"',
    "〉": '"',
    "《": '"',
    "》": '"',
    "「": '"',
    "」": '"',
    "『": '"',
    "』": '"',
    "【": '"',
    "】": '"',
    # Ideographic comma and full stop.
    "、": ",",
    "。": ".",
    # Full-width forms. They are written as escapes so the keys cannot be
    # silently folded to their ASCII look-alikes by an editor or a text
    # normaliser, which would turn each entry into a no-op.
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "\uff08": "(",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": ")",  # FULLWIDTH RIGHT PARENTHESIS
    "\uff1a": ":",  # FULLWIDTH COLON
    "\uff1b": ";",  # FULLWIDTH SEMICOLON
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    # ASCII identity entries, kept so existing lookups with these keys still
    # succeed.
    "!": "!",
    "(": "(",
    ")": ")",
    ":": ":",
    ";": ";",
    "?": "?",
}
# Characters that count as punctuation once a surface has been normalised.
PUNCT_VALUES = frozenset(
    {"!", '"', "(", ")", ",", ".", ":", ";", "?", "—", "…"}
)
# Punctuation that opens a span / punctuation that closes one.
# NOTE(review): '"' is a member of both sets, and the pyopenjtalk path tests
# PUNCT_STOPS before PUNCT_STARTS, so a quote is always handled as a stop.
# Confirm whether distinct opening and closing quote symbols were intended.
PUNCT_STARTS = frozenset({"(", '"'})
PUNCT_STOPS = frozenset({"!", ")", ",", ".", ":", ";", "?", '"'})
# Final character of every phoneme string in M2P; used to decide whether a
# space is needed before the next accent phrase.
TAILS = frozenset(phoneme[-1] for phoneme in M2P.values())
VOWELS = frozenset({"a", "e", "i", "o", "u"})
class JapaneseG2P(G2PBase):
    """Japanese G2P using pyopenjtalk or cutlet.

    Example:
        >>> g2p = JapaneseG2P()
        >>> tokens = g2p("こんにちは")
    """

    def __init__(
        self,
        language: str = "ja",
        use_espeak_fallback: bool = True,
        use_spacy: bool = False,
        spacy_model: str = "ja_core_news_sm",
        backend: str = "pyopenjtalk",
        unk: str = "",
        load_silver: bool = True,
        load_gold: bool = True,
        version: str = "1.0",
        **kwargs,
    ) -> None:
        """Initialize the Japanese G2P.

        Args:
            language: Language code (e.g., 'ja', 'ja-jp').
            use_espeak_fallback: Whether to use espeak for unknown words.
            use_spacy: Reserved for API consistency. Japanese uses
                pyopenjtalk/cutlet backends for tokenization and phonemization.
            spacy_model: Reserved for API consistency when use_spacy is enabled.
            backend: Backend to use ("pyopenjtalk" or "cutlet"). Any value
                other than "cutlet" results in the pyopenjtalk path.
            unk: Placeholder emitted for tokens that have no phonemes.
            load_silver: If True, load silver tier dictionary if available.
                Currently Japanese doesn't use dictionary system, so this
                parameter is reserved for future use and consistency.
            load_gold: If True, load gold tier dictionary if available.
                Currently Japanese doesn't use dictionary system, so this
                parameter is reserved for future use and consistency.
            version: Model version ("1.0" for base, "1.1" for multilingual).
            **kwargs: Additional arguments; accepted and ignored.
        """
        super().__init__(language=language, use_espeak_fallback=use_espeak_fallback)
        self.backend = backend
        self.version = version
        self.unk = unk
        self.use_spacy = use_spacy
        self.spacy_model = spacy_model
        self.load_silver = load_silver
        self.load_gold = load_gold
        # Both backends are created on first use (see the properties below).
        self._pyopenjtalk = None
        self._cutlet = None

    @property
    def pyopenjtalk(self):
        """Return the pyopenjtalk module, importing it on first access."""
        if self._pyopenjtalk is None:
            import pyopenjtalk

            self._pyopenjtalk = pyopenjtalk
        return self._pyopenjtalk

    @property
    def cutlet(self):
        """Return the Cutlet backend, or None when another backend is selected.

        The Cutlet instance is created on first access and only when
        ``self.backend == "cutlet"``.
        """
        if self._cutlet is None and self.backend == "cutlet":
            from kokorog2p.ja.cutlet import Cutlet

            self._cutlet = Cutlet()
        return self._cutlet

    @staticmethod
    def pron2moras(pron: str) -> list[str]:
        """Split a katakana pronunciation string into moras.

        Characters without an entry in ``M2P`` are dropped. A character is
        merged into the preceding mora when the combined string is itself an
        ``M2P`` key (e.g. a kana followed by a small kana); otherwise it starts
        a new mora.

        Args:
            pron: Katakana pronunciation string.

        Returns:
            List of mora strings, each of which is a key of ``M2P``.
        """
        moras: list[str] = []
        for k in pron:
            if k not in M2P:
                continue
            if moras and moras[-1] + k in M2P:
                moras[-1] += k
            else:
                moras.append(k)
        return moras

    def __call__(self, text: str) -> list[GToken]:
        """Convert text to tokens with phonemes.

        Args:
            text: Input text to convert.

        Returns:
            List of GToken objects with phonemes. Empty or whitespace-only
            input yields an empty list. When the backend returns no token
            list, a single token covering the whole input is returned.
        """
        if not text or not text.strip():
            return []
        phonemes, tokens = self._phonemize_internal(text)
        if tokens:
            ensure_gtoken_positions(tokens, text)
            return tokens
        # No per-word tokens available: wrap the whole input in one token.
        token = GToken(
            text=text,
            tag="X",
            whitespace="",
            phonemes=phonemes if phonemes else None,
        )
        token.rating = "ja" if phonemes else None
        tokens = [token]
        ensure_gtoken_positions(tokens, text)
        return tokens

    def _phonemize_internal(self, text: str) -> tuple[str, list[GToken] | None]:
        """Dispatch to the configured backend.

        Args:
            text: Input text.

        Returns:
            Tuple of (phoneme_string, token_list). For the cutlet backend the
            value returned by the Cutlet object is passed through unchanged.
        """
        if self.cutlet is not None:
            return self.cutlet(text)
        return self._phonemize_pyopenjtalk(text)

    def _phonemize_pyopenjtalk(self, text: str) -> tuple[str, list[GToken] | None]:
        """Phonemize using pyopenjtalk.

        Each word from ``pyopenjtalk.run_frontend`` is split into moras, given
        a per-mora accent code (0, 1, 2 or 3) and mapped to phonemes through
        ``M2P``. Punctuation surfaces are normalised through ``PUNCT_MAP``.

        Args:
            text: Input text.

        Returns:
            Tuple ``(output, tokens)``. ``output`` is the phoneme string
            immediately followed by a pitch string of the same length, in
            which ``"_"`` marks accent code 0, ``"^"`` marks code 3, ``"-"``
            marks codes 1 and 2, and ``"j"`` fills every position that has no
            pitch (spaces, punctuation, unknown tokens). ``tokens`` is the
            list of GToken objects that were built.
        """
        tokens: list[GToken] = []
        last_a = 0
        acc, mcount = None, 0
        for word in self.pyopenjtalk.run_frontend(text):
            pron, mora_size = word["pron"], word["mora_size"]
            moras = self.pron2moras(pron) if mora_size > 0 else []
            # A word continues the previous accent phrase when both it and the
            # previous token carry moras and either the frontend flags it as
            # chained or it starts with the prolonged-sound mark.
            # bool() matters: the bare `and` chain would evaluate to the
            # `tokens` list itself while that list is still empty, and storing
            # that object on the token below would create a self-reference
            # that turns truthy as soon as the token is appended.
            chain_flag = bool(
                mora_size > 0
                and tokens
                and tokens[-1].get("mora_size", 0) > 0
                and (word["chain_flag"] == 1 or (moras and moras[0] == "ー"))
            )
            if not chain_flag:
                # New accent phrase: restart the accent nucleus and mora count.
                acc, mcount = None, 0
            acc = word["acc"] if acc is None else acc
            accents = []
            for _ in moras:
                mcount += 1
                if acc == 0:
                    accents.append(0 if mcount == 1 else (1 if last_a == 0 else 2))
                elif acc == mcount:
                    accents.append(3)
                elif 1 < mcount < acc:
                    accents.append(1 if last_a == 0 else 2)
                else:
                    accents.append(0)
            # NOTE(review): placed after the mora loop because the
            # `if accents else 0` guard is only meaningful there; confirm
            # against the original indentation. The choice affects only
            # whether code 1 or 2 is stored, not the pitch string.
            last_a = accents[-1] if accents else 0

            surface = word["string"]
            if surface in PUNCT_MAP:
                surface = PUNCT_MAP[surface]

            whitespace, phonemes, pitch = "", None, None
            if moras:
                phonemes, pitch = "", ""
                for m, a in zip(moras, accents, strict=False):
                    ps = M2P.get(m, "")
                    phonemes += ps
                    # One pitch character per phoneme character.
                    pitch += ("_" if a == 0 else ("^" if a == 3 else "-")) * len(ps)
            elif surface and all(s in PUNCT_VALUES for s in surface):
                phonemes = surface
                if surface[-1] in PUNCT_STOPS:
                    # Closing punctuation hugs the previous token and is
                    # followed by a space.
                    whitespace = " "
                    if tokens:
                        tokens[-1].whitespace = ""
                elif (
                    surface[-1] in PUNCT_STARTS and tokens and not tokens[-1].whitespace
                ):
                    # Opening punctuation is preceded by a space.
                    tokens[-1].whitespace = " "

            # Separators produce no token of their own; they only put a space
            # after the previous token. The original condition was
            # `A and B and C or D`, which parses as `(A and B and C) or D`, so
            # a blank surface arriving before any token existed indexed an
            # empty list and raised IndexError. The two cases are now named
            # and the index is guarded.
            is_blank = bool(surface) and not surface.strip()
            is_middle_dot = bool(tokens) and phonemes is None and surface == "・"
            if is_blank or is_middle_dot:
                if tokens:
                    tokens[-1].whitespace = " "
                continue

            tk = GToken(
                text=surface,
                tag=word["pos"],
                whitespace=whitespace,
                phonemes=phonemes,
            )
            # Store extra data in extension dict
            tk._["pron"] = pron
            tk._["acc"] = word["acc"]
            tk._["mora_size"] = mora_size
            tk._["chain_flag"] = chain_flag
            tk._["moras"] = moras
            tk._["accents"] = accents
            tk._["pitch"] = pitch
            tokens.append(tk)

        # Build the phoneme string and the parallel pitch string.
        result, pitch_str = "", ""
        for tk in tokens:
            if tk.phonemes is None:
                result += self.unk + tk.whitespace
                pitch_str += "j" * len(self.unk + tk.whitespace)
                continue
            # Separate accent phrases with a space when the output so far ends
            # in a character from TAILS, unless the new phrase starts with ン.
            if (
                tk.get("mora_size")
                and not tk.get("chain_flag")
                and result
                and result[-1] in TAILS
                and tk.get("moras")
                and tk.get("moras")[0] != "ン"
            ):
                result += " "
                pitch_str += "j"
            result += tk.phonemes + tk.whitespace
            tk_pitch = tk.get("pitch")
            pitch_str += (
                ("j" * len(tk.phonemes)) if tk_pitch is None else tk_pitch
            ) + "j" * len(tk.whitespace)

        # Drop the whitespace contributed by the final token.
        if tokens and tokens[-1].whitespace and result.endswith(tokens[-1].whitespace):
            result = result[: -len(tokens[-1].whitespace)]
            pitch_str = pitch_str[: len(result)]
        return result + pitch_str, tokens

    def lookup(self, word: str, tag: str | None = None) -> str | None:
        """Look up a word's phonemes.

        Args:
            word: The word to look up.
            tag: Optional POS tag (ignored for Japanese).

        Returns:
            The backend's output string for ``word``, or None when ``word`` is
            empty or the backend produced an empty result.
        """
        if not word:
            return None
        result, _ = self._phonemize_internal(word)
        return result if result else None

    def phonemize(self, text: str) -> str:
        """Convert text to phonemes.

        Args:
            text: Input text to convert.

        Returns:
            The output string of the configured backend.
        """
        result, _ = self._phonemize_internal(text)
        return result

    def __repr__(self) -> str:
        return f"JapaneseG2P(language={self.language!r}, backend={self.backend!r})"

    def get_target_model(self) -> str:
        """Get the target Kokoro model variant for this G2P instance.

        Returns:
            Model identifier: version string ("1.1" or "1.0").
        """
        return self.version