""" Trie structure (Object) by James Tauber, improved by leonardo maffi, V. 1.2, July 18 2006. Other dict methods that can be added to this Trie object: len(t) the number of items in t with a value different from None __iter__ t.copy() t (shallow) copy of t repr(t) str(t) t.items() a copy of a's list of (key, value) pairs t.keys() a copy of a's list of keys t.update([b]) updates (and overwrites) key/value pairs from b t.fromkeys(seq[, value]) Creates a new dictionary with keys from seq and values set to value t.values() a copy of t's list of values t.get(k[, x]) a[k] if k in t, else x t.iteritems() return an iterator over (key, value) pairs t.iterkeys() return an iterator over the mapping's keys t.itervalues() return an iterator over the mapping's values Can elements be deleted? """ class _Novalue_class: def __repr__(self): return "Novalue" Novalue = _Novalue_class() class Trie: """A Trie is like a dictionary in that it maps keys to values. However, because of the way keys are stored, it allows look up based on the longest prefix that matches. Keys must be strings (modifying Trie.convert maybe it works with general sequences too).""" # James Tauber, http://jtauber.com/, # http://www.jtauber.com/blog/2005/02/10/updated_python_trie_implementation def __init__(self, trie=None): if trie is None: self._root = [Novalue, {}] else: self._root = trie def clear(self): self._root = [Novalue, {}] def __str__(self): return str(self._root) def __repr__(self): return "Trie(%r)" % self._root def __eq__(self, other): return self._root == other._root def __setitem__(self, key, value): curr_node = self._root for char in key: curr_node = curr_node[1].setdefault(char, [Novalue, {}]) curr_node[0] = value def __getitem__(self, key): """Return the value for the given key if it is present, raises a KeyError if key not found, and return Novalue if it is present a key2 that starts with key.""" curr_node = self._root for char in key: curr_node = curr_node[1][char] return curr_node[0] def __contains__(self, key): """Implements the in operator. Return True if the key is present or if it is present a key2 string that starts with key.""" curr_node = self._root for char in key: curr_node_1 = curr_node[1] if char in curr_node_1: curr_node = curr_node_1[char] else: return False return True def isleaf(self, key): """Return True if the key is present and it's a leaf of the Trie, False otherwise.""" curr_node = self._root for char in key: curr_node_1 = curr_node[1] if char in curr_node_1: curr_node = curr_node_1[char] else: return False return curr_node[0] is not Novalue def find_prefix(self, key): """Find as much of the key as one can, by using the longest prefix that has a value. Return (value, remainder) where remainder is the rest of the given string.""" curr_node = self._root remainder = key for char in key: if char in curr_node[1]: curr_node = curr_node[1][char] else: return curr_node[0], remainder # This pregressive slicing may be slow with CPython! (and fast enough with Psyco) remainder = remainder[1:] return curr_node[0], remainder def subtree(self, key): sub_tree = self.__class__() # Necessary for a good subclassing. raise NotImplemented __all__ = ["Novalue", "Trie"] if __name__ == "__main__": t = Trie() # t = Trie([Novalue, {}]) t["foo"] = "A" # t = Trie([Novalue, {'f': [Novalue, {'o': [Novalue, {'o': ['A', {}]}]}]}]) assert "fo" in t assert t["fo"] == Novalue assert t.isleaf("foo") assert not t.isleaf("fo") assert not t.isleaf("f") assert not t.isleaf("X") t["fo"] = "B" # t = Trie([Novalue, {'f': [Novalue, {'o': ['B', {'o': ['A', {}]}]}]}]) t["l"] = "C" # t = Trie([Novalue, {'l': ['C', {}], 'f': [Novalue, {'o': ['B', {'o': ['A', {}]}]}]}]) t["for"] = "D" # t = Trie([Novalue, {'l': ['C', {}], 'f': [Novalue, {'o': ['B', {'r': ['D', {}], 'o': ['A', {}]}]}]}]) assert t.isleaf("fo") assert t.isleaf("for") assert t["fo"] == "B" try: t["fool"] except KeyError: pass else: raise assert t.find_prefix("fo") == ("B", "") assert t.find_prefix("fool") == ("A", "l") assert "fo" in t assert "foo" in t assert "fool" not in t t.clear() # t = Trie([Novalue, {}]) t["ba"] = "A" # t = Trie([Novalue, {'b': [Novalue, {'a': ['A', {}]}]}]) t["b"] = "B" t["bar"] = "C" t["baz"] = "D" t["sp"] = "E" t["spa"] = "F" # t = Trie([Novalue, {'s': [Novalue, {'p': ['E', {'a': ['F', {}]}]}], 'b': ['B', {'a': ['A', {'r': ['C', {}], 'z': ['D', {}]}]}]}]) t1 = Trie() t1["b"] = 1 t2 = Trie([Novalue, {'b': [1, {}]}]) assert t1 == t2 """ # Untested t2 = t.subtree("ba") """ """ # ----------------------------------------------------------- # Speed test, for tuning from time import clock from random import choice, seed from string import uppercase seed(20) def rand_word(): return "".join(choice(uppercase) for i in xrange(5)) import psyco; psyco.full() words = file("WORDS.TXT").read().split("\n") n = 100000 words_in = [choice(words) for i in xrange(n)] words_notin = [rand_word() for i in xrange(n)] time = clock() t = Trie() for word in words: t[word] = None print round(clock()-time, 1) # 4.4 With Psyco: 3.5 time = clock() for word in words_in: word in t print round(clock()-time, 1) # 2.7s with n = 100000 time = clock() for word in words_notin: word in t print round(clock()-time, 1) # 1.1s with n = 100000 """