From 2bc41e2c623233dabc8c73df538c4384642405d2 Mon Sep 17 00:00:00 2001 From: Eevee Date: Sat, 22 Aug 2009 01:13:34 -0700 Subject: [PATCH] Added support for lookup by other language name. #15 English fuzzy matches are preferred, followed by Roomaji and then everything else. The return tuple from lookup() now has a `name` parameter for the actual name that was matched. --- pokedex/__init__.py | 8 ++++-- pokedex/lookup.py | 62 +++++++++++++++++++++++++++++---------------- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/pokedex/__init__.py b/pokedex/__init__.py index 4af3fa0..73391dc 100644 --- a/pokedex/__init__.py +++ b/pokedex/__init__.py @@ -66,8 +66,12 @@ def command_lookup(name): else: print "Fuzzy-matched:" - for object, language, exact in results: - print object.__tablename__, object.name, language + for result in results: + print "%s: %s" % (result.object.__tablename__, result.object.name), + if result.language: + print "(%s in %s)" % (result.name, result.language) + else: + print def command_help(): diff --git a/pokedex/lookup.py b/pokedex/lookup.py index 0bf1c18..2653b9a 100644 --- a/pokedex/lookup.py +++ b/pokedex/lookup.py @@ -95,6 +95,10 @@ def open_index(directory=None, session=None, recreate=False): writer = index.writer() # Index every name in all our tables of interest + # speller_entries becomes a list of (word, score) tuples; the score is 1 + # for English names, 8 for Roomaji, and 3 for everything else. I think + # this biases the results in the direction most people expect, especially + # when e.g. 
German names are very similar to English names speller_entries = [] for cls in indexed_tables.values(): q = session.query(cls) @@ -108,28 +112,31 @@ def open_index(directory=None, session=None, recreate=False): name = row.name.lower() writer.add_document(name=name, **row_key) - speller_entries.append(name) + speller_entries.append((name, 1)) for extra_key_func in extra_keys.get(cls, []): extra_key = extra_key_func(row) writer.add_document(name=extra_key, **row_key) # Pokemon also get other languages - if cls == tables.Pokemon: - for foreign_name in row.foreign_names: - name = foreign_name.name.lower() - writer.add_document(name=name, - language=foreign_name.language.name, - **row_key) - speller_entries.append(name) + for foreign_name in getattr(row, 'foreign_names', []): + moonspeak = foreign_name.name.lower() + if name == moonspeak: + # Don't add the English name again as a different language; + # no point and it makes spell results confusing + continue - if foreign_name.language.name == 'Japanese': - # Add Roomaji too - roomaji = romanize(foreign_name.name).lower() - writer.add_document(name=roomaji, - language='Roomaji', - **row_key) - speller_entries.append(roomaji) + writer.add_document(name=moonspeak, + language=foreign_name.language.name, + **row_key) + speller_entries.append((moonspeak, 3)) + + # Add Roomaji too + if foreign_name.language.name == 'Japanese': + roomaji = romanize(foreign_name.name).lower() + writer.add_document(name=roomaji, language='Roomaji', + **row_key) + speller_entries.append((roomaji, 8)) writer.commit() @@ -138,18 +145,20 @@ def open_index(directory=None, session=None, recreate=False): # at once, as every call to add_* does a commit(), and those seem to be # expensive speller = whoosh.spelling.SpellChecker(index.storage) - speller.add_words(speller_entries) + speller.add_scored_words(speller_entries) return index, speller -LookupResult = namedtuple('LookupResult', ['object', 'language', 'exact']) +LookupResult = 
namedtuple('LookupResult', + ['object', 'name', 'language', 'exact']) def lookup(name, session=None, indices=None, exact_only=False): """Attempts to find some sort of object, given a database session and name. - Returns a list of named (object, language, exact) tuples. `object` is a - database object, `language` is the name of the language in which the name - was found, and `exact` is True iff this was an exact match. + Returns a list of named (object, name, language, exact) tuples. `object` + is a database object, `name` is the name under which the object was found, + `language` is the name of the language in which the name was found, and + `exact` is True iff this was an exact match. This function currently ONLY does fuzzy matching if there are no exact matches. @@ -209,6 +218,12 @@ def lookup(name, session=None, indices=None, exact_only=False): seen = {} for result in results: # Skip dupe results + # Note! The speller prefers English names, but the query does not. So + # "latias" comes over "ratiasu". "latias" matches only the English + # row, comes out first, and all is well. + # However! The speller could then return "foo" which happens to be the + # name for two different things in different languages, and the + # non-English one could appear preferred. This is not very likely. seen_key = result['table'], result['row_id'] if seen_key in seen: continue @@ -216,6 +231,9 @@ def lookup(name, session=None, indices=None, exact_only=False): cls = indexed_tables[result['table']] obj = session.query(cls).get(result['row_id']) - objects.append(LookupResult(obj, result['language'], exact)) + objects.append(LookupResult(object=obj, + name=result['name'], + language=result['language'], + exact=exact)) - return objects + return objects[:5]