# 计算用户之间的余弦相似度defUserSimilarity(user_items):# 构造物品-用户倒排表
item_users = {}
for user, item_set in user_items.iteritems():
for k in item_set:
if k notin item_users:
item_users[k] = set()
item_users[k].add(user)
# 有共同浏览记录的用户的重合度
C = {} # C(u,v)
N = {} # N(u), N(v)for item, user_set in item_users.iteritems():
for u in user_set:
N[u] = 0if u notin N else N[u] # 初始化
C[u] = {} if u notin C else C[u] # 初始化
N[u] += 1for v in user_set:
C[u][v] = 0if v notin C[u] else C[u][v] # 初始化if(u!=v):
C[u][v] += 1# 计算最终的相似度矩阵 W
W = {} # W(u,v)for u, related_users in C.iteritems():
W[u] = {} if u notin W else W[u] # 初始化for v, Cuv in related_users.iteritems():
if Cuv!=0:
W[u][v] = 0if v notin W[u] else W[u][v] # 初始化
W[u][v] = Cuv / np.sqrt(N[u] * N[v])
return W
测试用例:
# 测试用例
user_items = {"Allen": set(['a','b','d']),
"Ben": set(['a','c']),
"Copper":set(['b','e']),
"Denny": set(['c','d','e'])}
# 计算用户余弦相似度
W = UserSimilarity(user_items)
# 从稀疏表达转矩阵
result = np.zeros((4,4))
users = ['Allen','Ben','Copper','Denny']
for u, related_users in W.iteritems():
for v, w in related_users.iteritems():
result[users.index(u)][users.index(v)] = w
print result
defUserSimilarity_IIF(user_items):# 构造物品-用户倒排表
item_users = {}
for user, item_set in user_items.iteritems():
for k in item_set:
if k notin item_users:
item_users[k] = set()
item_users[k].add(user)
# 有共同浏览记录的用户的重合度
C = {} # C(u,v)
N = {} # N(u), N(v)for item, user_set in item_users.iteritems():
for u in user_set:
N[u] = 0if u notin N else N[u]
C[u] = {} if u notin C else C[u]
N[u] += 1for v in user_set:
C[u][v] = 0if v notin C[u] else C[u][v]
if(u!=v):
C[u][v] += 1 / np.log(1 + len(user_set))
# 计算最终的相似度矩阵 W
W = {} # W(u,v)for u, related_users in C.iteritems():
W[u] = {} if u notin W else W[u] # 初始化for v, Cuv in related_users.iteritems():
if Cuv != 0:
W[u][v] = 0if v notin W[u] else W[u][v] # 初始化
W[u][v] = Cuv / np.sqrt(N[u] * N[v])
return W
defItemSimilarity(item_users):# 构造用户-物品倒排表
user_items = {}
for item, user_set in item_users.iteritems():
for k in user_set:
if k notin user_items:
user_items[k] = set()
user_items[k].add(item)
C = {}
N = {}
for u, item_set in user_items.iteritems():
for i in item_set:
N[i] = 0if i notin N else N[i]
C[i] = {} if i notin C else C[i]
N[i] += 1for j in item_set:
if i!=j:
C[i][j] = 0if j notin C[i] else C[i][j]
C[i][j] += 1# 计算物品相似度
W = {}
for i, related_items in C.iteritems():
W[i] = {} if i notin W else W[i]
for j, cij in related_items.iteritems():
if cij!=0:
W[i][j] = 0if j notin W[i] else W[i][j]
W[i][j] += cij / np.sqrt(N[i] * N[j])
return W
测试用例
item_users = {'a':{"Allen","Eric"},
'b':{'Allen','Ben','Denny'},
'c':{'Ben','Copper','Denny'},
'd':{'Allen','Copper','Denny',"Eric"},
'e':{'Ben'}}
W = ItemSimilarity(item_users)
# 从稀疏表达转矩阵
result = np.zeros((5,5))
items = ['a','b','c','d','e']
for i, related_items in W.iteritems():
for j, w in related_items.iteritems():
result[items.index(i)][items.index(j)] = w
print result
classrecord():def__init__(self, w, r):
self.weight = w
self.reason = r
defRecommand_withReason(user, user_items, W, K):
rank = {}
ru = user_items[user]
for i, pi in ru.iteritems():
for j, wj in sorted(W[i].iteritems(), key=lambda x:x[1], reverse=True)[:K]:
ifnot j in ru:
rank[j] = record(0,{}) if j notin rank else rank[j]
rank[j].weight += pi * wj
rank[j].reason[i] = pi * wj
return rank
测试样例
user_items = {"Allen": {'a':1, 'b':1, 'd':1},
"Ben": {'b':1, 'c':1, 'e':1},
"Copper":{'c':1, 'd':1},
"Denny": {'b':1, 'c':1, 'd':1},
"Eric": {'a':1, 'd':1}}
rank = Recommand_withReason("Allen", user_items, W, K=3)
for recommanded_item, record in rank.iteritems():
print'recommand', recommanded_item, record.weight
print'computed from', [(k,v) for k,v in record.reason.iteritems()]
测试结果
recommand c 1.24401693586
computed from [('b', 0.6666666666666666), ('d', 0.5773502691896258)]
recommand e 0.57735026919
computed from [('b', 0.5773502691896258)]
2.3 改进的ItemCF推荐
ItemCF-IUF
物品之间之所以能进行相似度衡量,是因为它们同时出现在多个用户的兴趣列表中,也就是说,每个用户的兴趣列表都将对物品的相似度产生贡献,但是每个用户的贡献是不同的。假设某用户是开书店的,他从京东上买了80万本书,但是他购买这些书并非出于自身的兴趣,因此他对于书与书之间的相似度产生的贡献应该很小,至少应该远远小于一个只买了几本自己最爱的侦探小说的用户。 论文提出了Item-IUF(Inverse User Frequency),他认为活跃的用户对物品相似度的贡献应该小于不活跃的用户。实际上对于一些过于活跃的用户,会直接排除而不参与相似度计算。
w(i,j)=1|N(i)||N(j)|√∑u∈N(i)∩N(j)1log(1+|N(u)|)
python实现 ItemCF-IUF
与普通的ItemCF相比,ItemCF-IUF只需要在计算C矩阵(分子)时略作修改即可。
defItemSimilarity_IUF(item_users):# 构造用户-物品倒排表
user_items = {}
for item, user_set in item_users.iteritems():
for k in user_set:
if k notin user_items:
user_items[k] = set()
user_items[k].add(item)
C = {}
N = {}
for u, item_set in user_items.iteritems():
for i in item_set:
N[i] = 0if i notin N else N[i]
C[i] = {} if i notin C else C[i]
N[i] += 1for j in item_set:
if i!=j:
C[i][j] = 0if j notin C[i] else C[i][j]
C[i][j] += 1 / np.log(1 + len(item_set))
# 计算物品相似度
W = {}
for i, related_items in C.iteritems():
W[i] = {} if i notin W else W[i]
for j, cij in related_items.iteritems():
if cij!=0:
W[i][j] = 0if j notin W[i] else W[i][j]
W[i][j] += cij / np.sqrt(N[i] * N[j])
return W
测试用例
item_users = {'a':{"Allen","Eric"},
'b':{'Allen','Ben','Denny'},
'c':{'Ben','Copper','Denny'},
'd':{'Allen','Copper','Denny',"Eric"},
'e':{'Ben'}}
W = ItemSimilarity_IUF(item_users)
【论文】Breese J S, Heckerman D, Kadie C. Empirical analysis of predictive algorithms for collaborative filtering[C] // Proc. Conference of Uncertainty in Articial Intelligence. 1998:43-52. 论文链接
【论文】Karypis G. Evaluation of Item-Based Top-N Recommendation Algorithms[C] // Tenth International Conference on Information and Knowledge Management. ACM, 2001:247-254. 论文链接
【书籍】《推荐系统实践》项亮
【书籍】《推荐系统》Dietmar Jannach等