inverted index 反向索引 python

一、简单版

from collections import defaultdict
class inverted_index:

    def __init__(self, docs):
        self.doc = defaultdict(set)
        for index, doc in enumerate(docs):
            for term in doc.split():
                self.doc[term].add(index)

    def search(self, term):
        return self.doc[term]


if __name__ == "__main__":
    docs = ["new home sales top forecasts june june june",
            "home sales rise in july june",
            "increase in home sales in july",
            "july new home sales new rise"]

    i = inverted_index(docs)
    a = 1
    print(i.search('sales'))

# 结果：{0, 1, 2, 3}

二、 nltk 单词版本，反向索引存储 -> search

# 【2】构建nltk中的单词 反向索引
"""
结果1(储存index)：
run_time =  0.012042999267578125

结果2(储存单词)：
run_time =  0.02428269386291504

结论：存储index比存储单词速度快一倍左右
"""

import time
from nltk.corpus import words
from collections import defaultdict

inverted_index = defaultdict(set)  # 如果同一个单词出现了重复的char，只会记录一次，属于某行，不能用default(list)
word_list = words.words()
a = 1

# 结果1存储index速度会更快，相对于存储单词
for i, word in enumerate(word_list):
    for char in word.lower():
        inverted_index[char].add(i)

# 结果2
# for i, word in enumerate(word_list):
#     for char in word.lower():
#         inverted_index[char].add(word)

# 需要搜索某个单词是否再哪一行, idx 用set.intersection()
start = time.time()
result = set.intersection(*(inverted_index[char] for char in "aej"))
end = time.time()
print('run_time = ', end-start)
print('result = ', result)
print('result_item = ', [word_list[i] for i in result])

def intersection(*args):
  left = args[0]
  # Perform len(args)-1 pairwise-intersections
  for right in args[1:]:

    # Tests take O(N) time, so minimize N by choosing the smaller set
    if len(left) > len(right):
      left, right = right, left

    # Do the pairwise intersection
    result = set()
    for element in left:
      if element in right:
        result.add(element)

    left = result  # Use as the start for the next intersection

  return left

三、

inverted index 反向索引 python

浏览过的版块