|
一、简单版
from collections import defaultdict
class inverted_index:
def __init__(self, docs):
self.doc = defaultdict(set)
for index, doc in enumerate(docs):
for term in doc.split():
self.doc[term].add(index)
def search(self, term):
return self.doc[term]
if __name__ == "__main__":
docs = ["new home sales top forecasts june june june",
"home sales rise in july june",
"increase in home sales in july",
"july new home sales new rise"]
i = inverted_index(docs)
a = 1
print(i.search('sales'))
# 结果:{0, 1, 2, 3}
二、 nltk 单词版本,反向索引 存储 -> search
# 【2】构建nltk中的单词 反向索引
"""
结果1(储存index):
run_time = 0.012042999267578125
结果2(储存单词):
run_time = 0.02428269386291504
结论:存储index比存储单词速度快一倍左右
"""
import time
from nltk.corpus import words
from collections import defaultdict
inverted_index = defaultdict(set) # 如果同一个单词出现了重复的char,只会记录一次,属于某行,不能用default(list)
word_list = words.words()
a = 1
# 结果1存储index速度会更快,相对于存储单词
for i, word in enumerate(word_list):
for char in word.lower():
inverted_index[char].add(i)
# 结果2
# for i, word in enumerate(word_list):
# for char in word.lower():
# inverted_index[char].add(word)
# 需要搜索某个单词是否再哪一行, idx 用set.intersection()
start = time.time()
result = set.intersection(*(inverted_index[char] for char in "aej"))
end = time.time()
print('run_time = ', end-start)
print('result = ', result)
print('result_item = ', [word_list[i] for i in result])
def intersection(*args):
left = args[0]
# Perform len(args)-1 pairwise-intersections
for right in args[1:]:
# Tests take O(N) time, so minimize N by choosing the smaller set
if len(left) > len(right):
left, right = right, left
# Do the pairwise intersection
result = set()
for element in left:
if element in right:
result.add(element)
left = result # Use as the start for the next intersection
return left
三、 |