From c40324652e003fe4fd0774a6a3767f9d0ecf6d50 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 4 Feb 2011 06:34:00 +0200 Subject: [PATCH] Add migration script to move English texts to language-specific tables The next commit will apply this script, changing nearly all of the CSV files. When your rebases stops there, run:: git reset --hard python scripts/migration-i18n.py git add -A pokedex/data/csv/ git rebase --continue --- scripts/migration-i18n.py | 272 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 scripts/migration-i18n.py diff --git a/scripts/migration-i18n.py b/scripts/migration-i18n.py new file mode 100644 index 0000000..4c1f99c --- /dev/null +++ b/scripts/migration-i18n.py @@ -0,0 +1,272 @@ +# Encoding: UTF-8 + +"""Moves/transforms values in CSVs in an ad-hoc way, based mainly on column name + +Auto-creates identifiers from names +Auto-creates names from identifiers +Copies IDs for foreign keys +Creates autoincrement-style IDs when missing +Sets text language to 9 (en), except when it sets to 1 (jp) + +And looks good doing it! +""" + +import csv +import re +import os +from StringIO import StringIO +from collections import namedtuple, defaultdict + +from sqlalchemy.orm import class_mapper + +from pokedex.db import tables, load + +english_id = 9 +japanese_id = 1 + +bw_version_group_id = 11 + +dir = load.get_default_csv_dir() + +def tuple_key(tup): + """Return a sort key for mixed int/string tuples. + + Strings sort first. + """ + def generator(): + for item in tup: + try: + yield (1, int(item)) + except ValueError: + yield (0, item) + return tuple(generator()) + +class MakeFieldFuncs: + """Various ways to get a new value from the old one""" + @staticmethod + def copy(field_name, source, **kwargs): + """Plain copy""" + return source[field_name] + + @staticmethod + def main(field_name, source, **kwargs): + """Populate aux table from the main table""" + return source[field_name] + + @staticmethod + def Main(field_name, source, **kwargs): + """Capitalize""" + return source[field_name].capitalize() + + @staticmethod + def ident(source, **kwargs): + """Craft an identifier from the 'identifier' or 'name' column""" + return name2ident(source.get('identifier', source.get('name'))) + + @staticmethod + def Name(source, **kwargs): + """Capitalize the name (or identifier) column""" + name = source.get('name', source.get('identifier', None)) + name = ' '.join(word.capitalize() for word in name.split(' ')) + return name + + @staticmethod + def f_id(source, **kwargs): + """Capitalize the identifier column""" + return source['identifier'].capitalize() + + @staticmethod + def name(source, **kwargs): + """Get the original name""" + return source['name'] + + @staticmethod + def newid(i, source, **kwargs): + """Assign a new "auto-incremented" id""" + source['id'] = i # hack to make srcid work + return i + + @staticmethod + def en(source, **kwargs): + """Assign the value for English -- unless it's Japanese""" + if source.get('version_group_id', None) == str(bw_version_group_id): + return japanese_id + return english_id + + @staticmethod + def srcid(source, field_name, **kwargs): + """The original table's id""" + try: + return source['id'] + except KeyError: + if field_name == 'pokemon_form_group_id': + # This one reuses another table's ID + return source['pokemon_id'] + else: + raise + +def name2ident(name): + ident = name.decode('utf-8').lower() + ident = ident.replace(u'+', ' plus ') + ident = re.sub(u'[ _–]+', u'-', ident) + ident = re.sub(u'[\'./;’(),:]', u'', ident) + ident = ident.replace(u'é', 'e') + ident = ident.replace(u'♀', '-f') + ident = ident.replace(u'♂', '-m') + if ident in ('???', '????'): + ident = 'unknown' + elif ident == '!': + ident = 'exclamation' + elif ident == '?': + ident = 'question' + for c in ident: + assert c in "abcdefghijklmnopqrstuvwxyz0123456789-", repr(ident) + return ident + + +FieldSpec = namedtuple('FieldSpec', 'out name func') + +def main(): + for table in sorted(tables.all_tables(), key=lambda t: t.__name__): + datafilename = dir + '/' + table.__tablename__ + '.csv' + classname = table.__name__ + if hasattr(table, 'object_table'): + # This is an auxilliary table; it'll be processed with the main one + continue + else: + print "%s: %s" % (classname, table.__tablename__) + with open(datafilename) as datafile: + datacsv = csv.reader(datafile, lineterminator='\n') + orig_fields = datacsv.next() + columns = class_mapper(table).c + new_fields = [] + main_out = [] + outputs = {datafilename: main_out} + name_out = None + srcfiles = [datafilename] + # Set new_fields to a list of FieldSpec object, one for each field we want in the csv + for column in columns: + name = column.name + if name == 'identifier': + new_fields.append(FieldSpec(datafilename, column.name, MakeFieldFuncs.ident)) + elif name in orig_fields: + new_fields.append(FieldSpec(datafilename, column.name, MakeFieldFuncs.copy)) + elif name == 'id': + new_fields.append(FieldSpec(datafilename, column.name, MakeFieldFuncs.newid)) + elif name == 'language_id': + new_fields.insert(2, FieldSpec(datafilename, column.name, MakeFieldFuncs.en)) + else: + raise AssertionError(name) + # Remember headers + headers = {datafilename: list(field.name for field in new_fields)} + # Pretty prnt :) + for field in new_fields: + print ' [{0.func.func_name:5}] {0.name}'.format(field) + # Do pretty much the same for aux tables + aux_tables = [] + for attrname in 'text_table prose_table'.split(): + aux_table = getattr(table, attrname, None) + if aux_table: + aux_datafilename = dir + '/' + aux_table.__tablename__ + '.csv' + print " %s: %s" % (aux_table.__name__, aux_table.__tablename__) + srcfiles.append(datafilename) + aux_tables.append(aux_table) + columns = class_mapper(aux_table).c + aux_out = [] + outputs[aux_datafilename] = aux_out + aux_fields = [] + for column in columns: + name = column.name + if name == 'language_id': + aux_fields.insert(1, FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.en)) + elif name == 'name' and table.__name__ == 'ItemFlag': + aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.f_id)) + elif name == 'description' and table.__name__ == 'ItemFlag': + aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.name)) + elif name in orig_fields and name == 'name' and table.__name__ in 'PokemonColor ContestType BerryFirmness'.split(): + # Capitalize these names + aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.Name)) + elif name in orig_fields and name in 'color flavor'.split() and table.__name__ == 'ContestType': + aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.Main)) + elif name in orig_fields: + aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.main)) + elif name == table.__singlename__ + '_id': + aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.srcid)) + elif name == 'name': + aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.Name)) + elif name == 'lang_id': + aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.srcid)) + else: + print orig_fields + raise AssertionError(name) + if name == 'name': + # If this table contains the name, remember that + name_fields = aux_fields + name_out = aux_out + # Sort aux tables nicely + def key(f): + if f.func == MakeFieldFuncs.srcid: + return 0 + elif f.name == 'language_id': + return 1 + elif f.name == 'name': + return 2 + else: + return 10 + aux_fields.sort(key=key) + new_fields += aux_fields + headers[aux_datafilename] = list(field.name for field in aux_fields) + # Pretty print :) + for field in aux_fields: + print ' [{0.func.func_name:5}] {0.name}'.format(field) + # Do nothing if the table's the same + if all(field.func == MakeFieldFuncs.copy for field in new_fields): + print u' → skipping' + continue + # Otherwise read the file + # outputs will be a (filename -> list of rows) dict + print u' → reading' + for autoincrement_id, src_row in enumerate(datacsv, start=1): + row = dict(zip(orig_fields, src_row)) + new_rows = defaultdict(list) + for field in new_fields: + new_rows[field.out].append(field.func( + source=row, + field_name=field.name, + i=autoincrement_id, + )) + for name, row in new_rows.items(): + outputs[name].append(row) + # If there was a _names table, read that and append it to the + # aux table that has names + try: + name_datafilename = dir + '/' + table.__singlename__ + '_names.csv' + name_file = open(name_datafilename) + except (AttributeError, IOError): + pass + else: + print u' → reading foreign names' + with name_file: + namecsv = csv.reader(name_file, lineterminator='\n') + src_fields = namecsv.next() + obj_id_fieldname = table.__singlename__ + '_id' + assert src_fields == [obj_id_fieldname, 'language_id', 'name'] + for name_row in namecsv: + name_dict = dict(zip(src_fields, name_row)) + row = [] + for field in name_fields: + row.append(name_dict.get(field.name, '')) + name_out.append(row) + os.unlink(name_datafilename) + # For all out files, write a header & sorted rows + print u' → writing' + for filename, rows in outputs.items(): + with open(filename, 'w') as outfile: + outcsv = csv.writer(outfile, lineterminator='\n') + outcsv.writerow(headers[filename]) + rows.sort(key=tuple_key) + for row in rows: + outcsv.writerow(row) + +if __name__ == '__main__': + main()