veekun_pokedex/pokedex/roomaji.py

245 lines
10 KiB
Python
Raw Normal View History

# encoding: utf8
2011-01-26 02:14:42 +01:00
"""Provides `romanize()` for romanizing simple Japanese text.
Also provides available romanizers in a dictionary keyed by language identifier.
"""
class Romanizer(object):
    """Converts kana text to roomaji using a set of lookup tables.

    Every table given to the constructor is kept in `self.tables` and is also
    exposed as an attribute of the same name.  `romanize()` reads the tables
    `roomaji_kana`, `roomaji_youon`, `roomaji_small_kana`,
    `roomaji_small_kana_combos`, `lengthened_vowels` and `y_drop`.
    """

    # Letters treated as vowels by the sokuon, n-apostrophe and ー rules
    _VOWELS = ('a', 'e', 'i', 'o', 'u', 'y')
    # Small tsu: doubles the consonant of the kana that follows it
    _SOKUON = (u'っ', u'ッ')
    # Prolonged sound mark: lengthens the preceding vowel
    _PROLONGED_MARK = u'ー'

    def __init__(self, parent=None, **tables):
        """Create a Romanizer

        parent: A Romanizer to base this one on
        tables: Dicts that become the object's attributes. If a parent is given,
            its tables are used, and updated with the given ones
        """
        self.parent = parent
        if parent:
            # Shallow-copy the mapping of tables first: assigning into
            # parent.tables directly would replace the parent's entries.
            merged = dict(parent.tables)
            for name, table in tables.items():
                # Take a copy -- don't want to clobber the parent's tables.
                # A table the parent does not have starts out empty.
                combined = dict(merged.get(name, {}))
                combined.update(table)
                merged[name] = combined
            self.tables = merged
        else:
            self.tables = tables

        for name, table in self.tables.items():
            setattr(self, name, table)

    def romanize(self, string):
        """Convert a string of kana to roomaji.

        string: text to convert.  Kana found in the tables are romanized,
            full-width Latin characters (U+FF01..U+FF5E) are mapped to their
            ASCII counterparts, and any other character is passed through.

        Returns the romanized text as a unicode string.

        Raises ValueError for a sokuon that precedes a Latin character, a
        vowel, a non-kana character or the end of the string; for a youon that
        does not follow an -i sound; and for a 'ー' that does not follow a
        vowel.
        """
        vowels = self._VOWELS

        characters = []
        last_kana = None  # Used for ー; っ or ッ; ん or ン
        last_char = None  # Used for small kana combos

        for char in string:
            # Full-width Latin
            if 0xff01 <= ord(char) <= 0xff5e:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon cannot precede Latin characters.")

                # XXX Real Unicode decomposition would be nicer
                char = chr(ord(char) - 0xff01 + 0x21)
                characters.append(char)

                last_kana = None

            # Small vowel kana
            elif char in self.roomaji_small_kana:
                # No previous character means there is nothing to combine with
                combo = None if last_char is None else last_char + char
                if combo in self.roomaji_small_kana_combos:
                    characters[-1] = self.roomaji_small_kana_combos[combo]
                else:
                    # If we don't know what it is...  act dumb and treat it as a
                    # full-size vowel.  Better than bailing, and seems to occur a
                    # lot, e.g. ピィ is "pii"
                    characters.append(self.roomaji_small_kana[char])

                last_kana = self.roomaji_small_kana[char]

            # Youon
            elif char in self.roomaji_youon:
                if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
                    raise ValueError("Youon must follow an -i sound.")

                # Drop the -i and append the ya/yu/yo sound
                new_sound = self.roomaji_youon[char]
                if last_kana in self.y_drop:
                    # Strip the y-
                    new_char = self.y_drop[last_kana] + new_sound[1:]
                else:
                    new_char = last_kana[:-1] + new_sound

                characters[-1] = new_char
                last_kana = new_char

            # Sokuon
            elif char in self._SOKUON:
                # Remember it and double the consonant next time around
                last_kana = 'sokuon'

            # Extended vowel or n
            elif char == self._PROLONGED_MARK:
                # last_kana is None at the start of the string and after any
                # non-kana character; there is no vowel to lengthen then.
                if not last_kana or last_kana[-1] not in vowels:
                    raise ValueError(u"'ー' must follow by a vowel.")
                if last_kana[-1] in self.lengthened_vowels:
                    # Swap the plain vowel for its lengthened form
                    characters[-1] = characters[-1][:-1]
                    characters.append(self.lengthened_vowels[last_kana[-1]])
                else:
                    characters.append(last_kana[-1])

                last_kana = None

            # Regular ol' kana
            elif char in self.roomaji_kana:
                kana = self.roomaji_kana[char]

                if last_kana == 'sokuon':
                    if kana[0] in vowels:
                        raise ValueError("Sokuon cannot precede a vowel.")

                    characters.append(kana[0])
                elif last_kana == 'n' and kana[0] in vowels:
                    characters.append("'")

                # Special characters for doubled kana.  The [-1:] slice keeps
                # an empty previous element from raising IndexError.
                if (kana[0] in self.lengthened_vowels and characters
                        and characters[-1][-1:] == kana):
                    kana = self.lengthened_vowels[kana[0]]
                    characters[-1] = characters[-1][:-1]

                characters.append(kana)

                last_kana = kana

            # Not Japanese?
            else:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon must be followed by another kana.")

                characters.append(char)

                last_kana = None

            last_char = char

        if last_kana == 'sokuon':
            raise ValueError("Sokuon cannot be the last character.")

        # Joining on a unicode literal yields unicode on Python 2 and str on
        # Python 3, without needing the Python-2-only `unicode` builtin.
        return u''.join(characters)
# Available romanizers, keyed by language identifier
romanizers = dict()

# English: Hepburn-style readings
romanizers['en'] = Romanizer(
    # Full-size kana -> romanized syllable
    roomaji_kana={
        # Hiragana
        u'あ': 'a',     u'い': 'i',     u'う': 'u',     u'え': 'e',     u'お': 'o',
        u'か': 'ka',    u'き': 'ki',    u'く': 'ku',    u'け': 'ke',    u'こ': 'ko',
        u'さ': 'sa',    u'し': 'shi',   u'す': 'su',    u'せ': 'se',    u'そ': 'so',
        u'た': 'ta',    u'ち': 'chi',   u'つ': 'tsu',   u'て': 'te',    u'と': 'to',
        u'な': 'na',    u'に': 'ni',    u'ぬ': 'nu',    u'ね': 'ne',    u'の': 'no',
        u'は': 'ha',    u'ひ': 'hi',    u'ふ': 'fu',    u'へ': 'he',    u'ほ': 'ho',
        u'ま': 'ma',    u'み': 'mi',    u'む': 'mu',    u'め': 'me',    u'も': 'mo',
        u'や': 'ya',                    u'ゆ': 'yu',                    u'よ': 'yo',
        u'ら': 'ra',    u'り': 'ri',    u'る': 'ru',    u'れ': 're',    u'ろ': 'ro',
        u'わ': 'wa',    u'ゐ': 'wi',                    u'ゑ': 'we',    u'を': 'wo',
                                                                        u'ん': 'n',
        u'が': 'ga',    u'ぎ': 'gi',    u'ぐ': 'gu',    u'げ': 'ge',    u'ご': 'go',
        u'ざ': 'za',    u'じ': 'ji',    u'ず': 'zu',    u'ぜ': 'ze',    u'ぞ': 'zo',
        u'だ': 'da',    u'ぢ': 'ji',    u'づ': 'dzu',   u'で': 'de',    u'ど': 'do',
        u'ば': 'ba',    u'び': 'bi',    u'ぶ': 'bu',    u'べ': 'be',    u'ぼ': 'bo',
        u'ぱ': 'pa',    u'ぴ': 'pi',    u'ぷ': 'pu',    u'ぺ': 'pe',    u'ぽ': 'po',

        # Katakana
        u'ア': 'a',     u'イ': 'i',     u'ウ': 'u',     u'エ': 'e',     u'オ': 'o',
        u'カ': 'ka',    u'キ': 'ki',    u'ク': 'ku',    u'ケ': 'ke',    u'コ': 'ko',
        u'サ': 'sa',    u'シ': 'shi',   u'ス': 'su',    u'セ': 'se',    u'ソ': 'so',
        u'タ': 'ta',    u'チ': 'chi',   u'ツ': 'tsu',   u'テ': 'te',    u'ト': 'to',
        u'ナ': 'na',    u'ニ': 'ni',    u'ヌ': 'nu',    u'ネ': 'ne',    u'ノ': 'no',
        u'ハ': 'ha',    u'ヒ': 'hi',    u'フ': 'fu',    u'ヘ': 'he',    u'ホ': 'ho',
        u'マ': 'ma',    u'ミ': 'mi',    u'ム': 'mu',    u'メ': 'me',    u'モ': 'mo',
        u'ヤ': 'ya',                    u'ユ': 'yu',                    u'ヨ': 'yo',
        u'ラ': 'ra',    u'リ': 'ri',    u'ル': 'ru',    u'レ': 're',    u'ロ': 'ro',
        u'ワ': 'wa',    u'ヰ': 'wi',                    u'ヱ': 'we',    u'ヲ': 'wo',
                                                                        u'ン': 'n',
        u'ガ': 'ga',    u'ギ': 'gi',    u'グ': 'gu',    u'ゲ': 'ge',    u'ゴ': 'go',
        u'ザ': 'za',    u'ジ': 'ji',    u'ズ': 'zu',    u'ゼ': 'ze',    u'ゾ': 'zo',
        u'ダ': 'da',    u'ヂ': 'ji',    u'ヅ': 'dzu',   u'デ': 'de',    u'ド': 'do',
        u'バ': 'ba',    u'ビ': 'bi',    u'ブ': 'bu',    u'ベ': 'be',    u'ボ': 'bo',
        u'パ': 'pa',    u'ピ': 'pi',    u'プ': 'pu',    u'ペ': 'pe',    u'ポ': 'po',
        u'ヴ': 'vu',
    },

    # Small ya/yu/yo, which merge with a preceding -i kana
    roomaji_youon={
        # Hiragana
        u'ゃ': 'ya',                    u'ゅ': 'yu',                    u'ょ': 'yo',

        # Katakana
        u'ャ': 'ya',                    u'ュ': 'yu',                    u'ョ': 'yo',
    },

    # XXX If romanize() ever handles hiragana, it will need to make sure that the
    # preceding character was a katakana
    # This does not include every small kana combination, but should include every
    # one used in a Pokémon name.  An exhaustive list would be..  very long
    roomaji_small_kana={
        u'ァ': 'a',     u'ィ': 'i',     u'ゥ': 'u',     u'ェ': 'e',     u'ォ': 'o',
    },
    # Two-character sequences (full-size kana + small vowel) with a fixed reading
    roomaji_small_kana_combos={
        # These are, by the way, fairly arbitrary.  "shi xi" to mean "sy" is
        # particularly weird, but it seems to be what GF intends

        # Simple vowel replacement
                        u'ウィ': 'wi',  u'ウゥ': 'wu',  u'ウェ': 'we',  u'ウォ': 'wo',
        u'ヴァ': 'va',  u'ヴィ': 'vi',                  u'ヴェ': 've',  u'ヴォ': 'vo',
                                                        u'チェ': 'che',
                                                        u'シェ': 'she',
                                                        u'ジェ': 'je',
        u'テァ': 'tha', u'ティ': 'ti',  u'テゥ': 'thu', u'テェ': 'tye', u'テォ': 'tho',
        u'デァ': 'dha', u'ディ': 'di',  u'デゥ': 'dhu', u'デェ': 'dye', u'デォ': 'dho',
        u'ファ': 'fa',  u'フィ': 'fi',  u'ホゥ': 'hu',  u'フェ': 'fe',  u'フォ': 'fo',

        # Not so much
        u'シィ': 'sy',
        u'ミィ': 'my',
        u'ビィ': 'by',
        u'ピィ': 'py',
    },
    # English output has no special long-vowel letters; vowels are just repeated
    lengthened_vowels={},
    # Syllables whose youon form drops the 'y' (chi + ya -> cha, not chya)
    y_drop={'chi': 'ch', 'shi': 'sh', 'ji': 'j'},
)
# Czech: inherits the English tables and overrides the entries that are
# spelled differently
romanizers['cs'] = Romanizer(parent=romanizers['en'],
    roomaji_kana={
        # Hiragana
        u'し': u'ši', u'ち': u'či', u'つ': u'cu',
        u'や': u'ja', u'ゆ': u'ju', u'よ': u'jo',
        u'じ': u'dži', u'ぢ': u'dži',
        # Katakana
        u'シ': u'ši', u'チ': u'či', u'ツ': u'cu',
        u'ヤ': u'ja', u'ユ': u'ju', u'ヨ': 'jo',
        u'ジ': u'dži', u'ヂ': u'dži',
    },
    roomaji_youon={
        u'ゃ': 'ja', u'ゅ': 'ju', u'ょ': 'jo',
        u'ャ': 'ja', u'ュ': 'ju', u'ョ': 'jo',
    },
    roomaji_small_kana_combos={
        u'チェ': u'če', u'シェ': u'še', u'ジェ': u'dže',
        u'テェ': u'tje', u'デェ': u'dje',
        # NOTE(review): these four values were empty strings in the text this
        # was restored from; long-i spellings are assumed here to match
        # lengthened_vowels below -- confirm against the upstream table.
        u'シィ': u'sí', u'ミィ': u'mí', u'ビィ': u'bí', u'ピィ': u'pí',
    },
    # A doubled or ー-extended vowel is written with an acute accent
    lengthened_vowels={'a': u'á', 'e': u'é', 'i': u'í', 'o': u'ó', 'u': u'ú'},
    # Stems used in place of the -i syllable before a youon.
    # NOTE(review): the 'dži' stem was an empty string in the text this was
    # restored from; u'dž' follows the či -> č / ši -> š pattern -- confirm.
    y_drop={u'či': u'č', u'ši': u'š', u'dži': u'dž', u'ni': u'ňj'},
)
def romanize(string, lang='en'):
    """Convert a string of kana to roomaji.

    string: the kana text to convert
    lang: key into `romanizers`; an unknown language falls back to 'en'

    Returns whatever the selected romanizer's `romanize()` returns.
    """
    # Get the correct romanizer; fall back to English.  The fallback must be
    # the romanizer object itself -- passing the string 'en' as the default
    # would make the .romanize() call below fail for unknown languages.
    romanizer = romanizers.get(lang)
    if romanizer is None:
        romanizer = romanizers['en']

    # Romanize away!
    return romanizer.romanize(string)