Trie樹,也叫字典樹、字首樹。可用於“predictive text”和“autocompletion”,亦可用於統計詞頻(邊插入Trie樹邊更新或加入詞頻)。
在電腦科學中,trie(又稱字首樹或字典樹)是一種有序樹,用於儲存關聯陣列,當中的鍵一般是字串。與二叉查詢樹不同,鍵不是直接儲存在節點中,而是由節點在樹中的位置決定。一個節點的全部子孫都有同樣的字首,也就是這個節點相應的字串,而根節點相應空字串。
一般情況下,不是全部的節點都有相應的值,只有葉子節點和部分內部節點所相應的鍵才有相關的值。
參考資料:http://zh.wikipedia.org/wiki/Trie
#!/usr/bin/python
# -*- coding:utf-8 -*-
# * trie, prefix tree, can be used as a dict
# * author: yangxudongsuda@gmail.com
import sys
# Python 2-only idiom: switch the interpreter-wide default str<->unicode
# codec to UTF-8.  The module is reloaded first because setdefaultencoding is
# normally no longer reachable on the already-imported sys module.
# NOTE(review): both calls fail on Python 3 (no builtin reload, no
# sys.setdefaultencoding) - confirm the target interpreter before porting.
reload(sys)
sys.setdefaultencoding("utf-8")
# Sentinel meaning "no value stored".  It is a class rather than an instance
# so that pickle stores it by reference and ``is NULL`` checks keep working
# after a dump/load round trip.
class NULL(object):
    """Marker for "this node holds no value"; ``None`` stays a legal value."""


class Node(object):
    """One trie node: an optional value plus its children keyed by element."""

    def __init__(self, value=NULL):
        self.value = value      # NULL unless a key ends at this node
        self.children = {}      # element -> Node


class Trie(object):
    """Prefix tree whose keys are sequences of elements (e.g. words).

    Every public method accepts the key either as a list/tuple of elements
    or as a string that is split on ``sep``.  Empty elements (produced by
    repeated, leading or trailing separators) are ignored everywhere, so a
    key is always looked up the same way ``insert`` stored it.  An empty key
    addresses the root node.
    """

    _NO_MATCH = "no item matches any prefix of the given key!"

    def __init__(self):
        self.root = Node()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _elements(key, sep):
        """Normalise ``key`` to a list of its non-empty elements."""
        parts = key if isinstance(key, (list, tuple)) else key.split(sep)
        return [part for part in parts if part]

    def _find(self, elements):
        """Return the node reached by following ``elements``, or None."""
        node = self.root
        for element in elements:
            node = node.children.get(element)
            if node is None:
                return None
        return node

    def _longest_match(self, elements):
        """Return ``(depth, value)`` of the deepest valued node on the path.

        ``depth`` is the number of leading elements that form the matching
        key.  Returns None when no node along the path holds a value.
        """
        node = self.root
        best = None if node.value is NULL else (0, node.value)
        for depth, element in enumerate(elements, 1):
            node = node.children.get(element)
            if node is None:
                break
            if node.value is not NULL:
                # Remember it: deeper nodes may exist without a value.
                best = (depth, node.value)
        return best

    def _no_match(self, default):
        """Return ``default``, or raise Exception if none was supplied."""
        if default is NULL:
            raise Exception(self._NO_MATCH)
        return default

    # ------------------------------------------------------------------
    # dict-like operations
    # ------------------------------------------------------------------
    def insert(self, key, value=None, sep=' '):
        """Store ``value`` under ``key``, creating nodes as needed.

        An existing value for the same key is overwritten.
        """
        node = self.root
        for element in self._elements(key, sep):
            child = node.children.get(element)
            if child is None:
                child = node.children[element] = Node()
            node = child
        node.value = value

    def get(self, key, default=None, sep=' '):
        """Return the value stored under exactly ``key``, else ``default``."""
        node = self._find(self._elements(key, sep))
        if node is None or node.value is NULL:
            return default
        return node.value

    def delete(self, key, sep=' '):
        """Remove ``key``; return True if it was present, False otherwise.

        Nodes left with neither a value nor children are pruned.  Ancestors
        that still store their own value are kept.
        """
        elements = self._elements(key, sep)
        if not elements:
            # The empty key lives on the root, which is never pruned.
            if self.root.value is NULL:
                return False
            self.root.value = NULL
            return True
        return self.__delete(elements)

    def __delete(self, elements, node=None, i=0):
        """Recursive worker for ``delete``; ``elements`` must be non-empty."""
        if node is None:
            node = self.root
        element = elements[i]
        child = node.children.get(element)
        if child is None:
            return False
        if i + 1 == len(elements):
            if child.value is NULL:
                return False    # path exists but no key ends here
            child.value = NULL
        elif not self.__delete(elements, child, i + 1):
            return False
        # Prune only nodes that are now completely useless; a childless node
        # that still carries a value is a key of its own and must survive.
        if not child.children and child.value is NULL:
            del node.children[element]
        return True

    # ------------------------------------------------------------------
    # prefix queries
    # ------------------------------------------------------------------
    def shortest_prefix(self, key, default=NULL, sep=' '):
        """Return the shortest stored key that is a prefix of ``key``.

        The result is the matching elements joined with ``sep``.  Returns
        ``default`` when nothing matches, or raises Exception if no default
        was given.
        """
        elements = self._elements(key, sep)
        node = self.root
        if node.value is not NULL:
            return sep.join([])     # the empty key is the shortest prefix
        for depth, element in enumerate(elements, 1):
            node = node.children.get(element)
            if node is None:
                break
            if node.value is not NULL:
                return sep.join(elements[:depth])
        return self._no_match(default)

    def longest_prefix(self, key, default=NULL, sep=' '):
        """Return the longest stored key that is a prefix of ``key``.

        The result is the matching elements joined with ``sep``.  Returns
        ``default`` when nothing matches, or raises Exception if no default
        was given.
        """
        elements = self._elements(key, sep)
        match = self._longest_match(elements)
        if match is None:
            return self._no_match(default)
        return sep.join(elements[:match[0]])

    def longest_prefix_value(self, key, default=NULL, sep=' '):
        """Return the value of the longest stored key prefixing ``key``.

        Returns ``default`` when nothing matches, or raises Exception if no
        default was given.
        """
        match = self._longest_match(self._elements(key, sep))
        if match is None:
            return self._no_match(default)
        return match[1]

    def longest_prefix_item(self, key, default=NULL, sep=' '):
        """Return ``(prefix, value)`` for the longest stored key prefixing ``key``.

        Returns ``default`` unchanged (not wrapped in a tuple) when nothing
        matches, or raises Exception if no default was given.
        """
        elements = self._elements(key, sep)
        match = self._longest_match(elements)
        if match is None:
            return self._no_match(default)
        depth, value = match
        return (sep.join(elements[:depth]), value)

    # ------------------------------------------------------------------
    # enumeration
    # ------------------------------------------------------------------
    def __collect_items(self, node, path, results, sep):
        """Append ``(key, value)`` for every valued node at or below ``node``.

        ``path`` holds the elements leading to ``node``; it is extended and
        restored in place while descending.
        """
        if node.value is not NULL:
            results.append((sep.join(path), node.value))
        for element, child in node.children.items():
            path.append(element)
            self.__collect_items(child, path, results, sep)
            path.pop()
        return results

    def items(self, prefix, sep=' '):
        """Return ``(key, value)`` pairs for all keys starting with ``prefix``.

        An empty prefix lists the whole trie.  The order follows dict
        iteration order of the child tables.
        """
        elements = self._elements(prefix, sep)
        node = self._find(elements)
        if node is None:
            return []
        # Seed the path with the individual elements (not the raw prefix) so
        # list prefixes work and keys are always joined with ``sep``.
        return self.__collect_items(node, list(elements), [], sep)

    def keys(self, prefix, sep=' '):
        """Return all stored keys starting with ``prefix``."""
        return [key for key, _ in self.items(prefix, sep)]
# Demo / smoke test.  Every print passes a single argument so the output is
# the same whether print is a statement (Python 2) or a function.
if __name__ == '__main__':
    trie = Trie()
    trie.insert('happy 站臺', 1)
    trie.insert('happy 站臺 xx', 10)
    trie.insert('happy 站臺 xx yy', 11)
    trie.insert('happy 站臺 美食 購物 廣場', 2)
    trie.insert('sm')               # stored with the default value None
    trie.insert('sm 國際', 22)
    trie.insert('sm 國際 廣場', 2)
    trie.insert('sm 城市廣場', 3)
    trie.insert('sm 廣場', 4)
    trie.insert('sm 新生活 廣場', 5)
    trie.insert('sm 購物 廣場', 6)
    trie.insert('soho 尚都', 3)

    print(trie.get('sm'))
    print(trie.longest_prefix([], default="empty list"))
    print(trie.longest_prefix('sm'))
    print(trie.shortest_prefix('happy 站臺'))
    print(trie.shortest_prefix('happy 站臺 xx'))
    print(trie.shortest_prefix('sm'))
    # With sep='&' the whole string is one element, so nothing matches.
    print(trie.longest_prefix('sm xx', sep = '&', default = None))
    print('%s %s' % ('sm 廣場 --> ', trie.get('sm 廣場')))
    print(trie.get('sm 廣場'.split(' ')))
    print(trie.get('神馬'))
    print(trie.get('happy 站臺'))
    print(trie.get('happy 站臺 美食 購物 廣場'))
    print(trie.longest_prefix('soho 廣場', 'default'))
    print(trie.longest_prefix('soho 尚都 廣場'))
    print(trie.longest_prefix_value('soho 尚都 廣場'))
    print(trie.longest_prefix_value('xx 尚都 廣場', 90))
    print(trie.longest_prefix_value('xx 尚都 廣場', 'no prefix'))
    print(trie.longest_prefix_item('soho 尚都 廣場'))

    print('============== keys =================')
    print('%s %s' % ('prefix "sm": ', ' | '.join(trie.keys('sm'))))
    print('============== items =================')
    print('%s %s' % ('prefix "sm": ', trie.items('sm')))

    print('================= delete =====================')
    print(trie.delete('sm 廣場'))
    print(trie.get('sm 廣場'))
    print(trie.delete('sm 國際'))
    print(trie.get('sm 國際'))
    print(trie.delete('sm xx'))
    print(trie.delete('xx'))

    print('====== no item matches any prefix of given key ========')
    # 'happy' exists only as an inner node without a value and no default is
    # passed, so this call raises.  The exception is deliberately left
    # uncaught, which means the final line is never reached.
    print(trie.longest_prefix_value('happy'))
    print(trie.longest_prefix_value('soho xx'))
執行結果:
None
empty list
sm
happy 站臺
happy 站臺
sm
None
sm 廣場 --> 4
4
None
1
2
default
soho 尚都
3
90
no prefix
('soho \xe5\xb0\x9a\xe9\x83\xbd', 3)
============== keys =================
prefix "sm": sm | sm 新生活 廣場 | sm 城市廣場 | sm 廣場 | sm 購物 廣場 | sm 國際 | sm 國際 廣場
============== items =================
prefix "sm": [('sm', None), ('sm \xe6\x96\xb0\xe7\x94\x9f\xe6\xb4\xbb \xe5\xb9\xbf\xe5\x9c\xba', 5), ('sm \xe5\x9f\x8e\xe5\xb8\x82\xe5\xb9\xbf\xe5\x9c\xba', 3), ('sm \xe5\xb9\xbf\xe5\x9c\xba', 4), ('sm \xe8\xb4\xad\xe7\x89\xa9 \xe5\xb9\xbf\xe5\x9c\xba', 6),
('sm \xe5\x9b\xbd\xe9\x99\x85', 22), ('sm \xe5\x9b\xbd\xe9\x99\x85 \xe5\xb9\xbf\xe5\x9c\xba', 2)]
================= delete =====================
True
None
True
None
False
False
====== no item matches any prefix of given key ========
Traceback (most recent call last):
File "./word_based_trie.py", line 225, in <module>
print trie.longest_prefix_value('happy')
File "./word_based_trie.py", line 128, in longest_prefix_value
raise Exception("no item matches any prefix of the given key!")
Exception: no item matches any prefix of the given key!