inverted index 反向索引 python

论坛 期权论坛 脚本     
匿名技术用户   2021-1-14 14:39   152   0

一、简单版

from collections import defaultdict
class inverted_index:

    def __init__(self, docs):
        self.doc = defaultdict(set)
        for index, doc in enumerate(docs):
            for term in doc.split():
                self.doc[term].add(index)

    def search(self, term):
        return self.doc[term]


if __name__ == "__main__":
    docs = ["new home sales top forecasts june june june",
            "home sales rise in july june",
            "increase in home sales in july",
            "july new home sales new rise"]

    i = inverted_index(docs)
    a = 1
    print(i.search('sales'))

# 结果:{0, 1, 2, 3}

二、 nltk 单词版本,反向索引 存储 -> search

# 【2】构建nltk中的单词 反向索引
"""
结果1(储存index):
run_time =  0.012042999267578125

结果2(储存单词):
run_time =  0.02428269386291504

结论:存储index比存储单词速度快一倍左右
"""

import time
from nltk.corpus import words
from collections import defaultdict

inverted_index = defaultdict(set)  # 如果同一个单词出现了重复的char,只会记录一次,属于某行,不能用default(list)
word_list = words.words()
a = 1

# 结果1存储index速度会更快,相对于存储单词
for i, word in enumerate(word_list):
    for char in word.lower():
        inverted_index[char].add(i)

# 结果2
# for i, word in enumerate(word_list):
#     for char in word.lower():
#         inverted_index[char].add(word)

# 需要搜索某个单词是否再哪一行, idx 用set.intersection()
start = time.time()
result = set.intersection(*(inverted_index[char] for char in "aej"))
end = time.time()
print('run_time = ', end-start)
print('result = ', result)
print('result_item = ', [word_list[i] for i in result])
def intersection(*args):
  left = args[0]
  # Perform len(args)-1 pairwise-intersections
  for right in args[1:]:

    # Tests take O(N) time, so minimize N by choosing the smaller set
    if len(left) > len(right):
      left, right = right, left

    # Do the pairwise intersection
    result = set()
    for element in left:
      if element in right:
        result.add(element)

    left = result  # Use as the start for the next intersection

  return left

三、

分享到 :
0 人收藏
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

积分:7942463
帖子:1588486
精华:0
期权论坛 期权论坛
发布
内容

下载期权论坛手机APP