import numpy as np
# Build a synthetic shopping-basket dataset: one row per transaction and one
# boolean column per product, in the order given by `features`.
# Every row starts out all False (nothing bought).
X = np.zeros((100, 5), dtype='bool')
features = ["bread", "milk", "cheese", "apples", "bananas"]

for i in range(X.shape[0]):
    if np.random.random() < 0.3:
        # A bread winner
        X[i, 0] = True
        if np.random.random() < 0.5:
            # Who likes milk
            X[i, 1] = True
        if np.random.random() < 0.2:
            # Who likes cheese
            X[i, 2] = True
        if np.random.random() < 0.25:
            # Who likes apples
            X[i, 3] = True
        if np.random.random() < 0.5:
            # Who likes bananas
            X[i, 4] = True
    else:
        # Not a bread winner (the bread column stays False)
        if np.random.random() < 0.5:
            # Who likes milk
            X[i, 1] = True
            if np.random.random() < 0.2:
                # Who likes cheese
                X[i, 2] = True
            if np.random.random() < 0.25:
                # Who likes apples
                X[i, 3] = True
            if np.random.random() < 0.5:
                # Who likes bananas
                X[i, 4] = True
        else:
            # Buys neither bread nor milk
            if np.random.random() < 0.8:
                # Who likes cheese
                X[i, 2] = True
            if np.random.random() < 0.6:
                # Who likes apples
                X[i, 3] = True
            if np.random.random() < 0.7:
                # Who likes bananas
                X[i, 4] = True
    if not X[i].any():
        # Must buy something, so gets bananas
        X[i, 4] = True

print(X[:5])
# Persist as 0/1 integers, one transaction per line.
np.savetxt("affinity_dataset.txt", X, fmt="%d")
5 处理数据
import numpy as np
# Load the transaction matrix: one row per sample, one 0/1 column per product.
dataset_filename = "affinity_dataset.txt"
# ndmin=2 keeps the array two-dimensional even when the file holds a single
# row, so the shape unpacking below cannot fail on a one-transaction file.
X = np.loadtxt(dataset_filename, ndmin=2)
n_samples, n_features = X.shape
print("This dataset has {0} samples and {1} features".format(n_samples, n_features))
features = ['bread', 'milk', 'cheese', 'apples', 'bananas']

# Count the transactions whose apples column (index 3) is set.
num_apple_purchases = int(np.count_nonzero(X[:, 3] == 1))
print("{0} people bought Apples".format(num_apple_purchases))
# Evaluate the single rule "apples -> bananas" (columns 3 and 4 of X):
# among the samples that contain apples, count how many also contain bananas
# (rule valid) and how many do not (rule invalid).
rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3] != 1:
        # No apples in this sample, so the rule does not apply to it.
        continue
    if sample[4] == 1:
        rule_valid += 1
    else:
        rule_invalid += 1

print("{0} cases of the rule being valid were discovered".format(rule_valid))
print("{0} cases of the rule being invalid were discovered".format(rule_invalid))
output
21 cases of the rule being valid were discovered
15 cases of the rule being invalid were discovered
下面计算规则的支持度和置信度:
# Support is the number of samples in which the rule held; confidence is the
# fraction of samples matching the premise (apples) in which it held.
support = rule_valid
# Guard against a dataset with no apple purchases, where the ratio is
# undefined; report 0.0 instead of raising ZeroDivisionError.
confidence = rule_valid / num_apple_purchases if num_apple_purchases else 0.0

print("The support is {0} and the confidence is {1:.3f}".format(support, confidence))
print("As a percentage,that is {0:.1f}%".format(100 * confidence))
output:
The support is 21 and the confidence is 0.583
As a percentage,that is 58.3%
下面开始挖掘所有可能的规则(假设只有一种规则形式:购买 A 的人也会购买 B,记作 A -> B):
from collections import defaultdict
# Tally every ordered (premise, conclusion) pair of distinct features.
# Keys of valid_rules / invalid_rules are (premise, conclusion) column-index
# tuples; num_occurences is keyed by the premise column index.
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)

for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0:
            # Premise not present in this sample: no rule with it applies.
            continue
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:
                # A rule "A -> A" carries no information.
                continue
            if sample[conclusion] == 1:
                valid_rules[(premise, conclusion)] += 1
            else:
                invalid_rules[(premise, conclusion)] += 1

# Support of a rule is simply how many times it was observed to hold.
support = valid_rules

# Confidence = times the rule held / times its premise occurred. Only rules
# that held at least once get an entry, so the divisor is never zero.
confidence = defaultdict(float)
for (premise, conclusion), valid_count in valid_rules.items():
    confidence[(premise, conclusion)] = valid_count / num_occurences[premise]
输出规则:
# Print every mined rule together with its confidence and support.
# Rule keys are (premise, conclusion) index pairs into `features`.
for premise, conclusion in confidence.keys():
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will alse buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")
Output:
Rule: If a person buys cheese they will alse buy apples
Confidence: 0.610
Support: 25
Rule: If a person buys cheese they will alse buy bananas
Confidence: 0.659
Support: 27
Rule: If a person buys apples they will alse buy cheese
Confidence: 0.694
Support: 25
Rule: If a person buys apples they will alse buy bananas
Confidence: 0.583
Support: 21
Rule: If a person buys bananas they will alse buy cheese
Confidence: 0.458
Support: 27
Rule: If a person buys bananas they will alse buy apples
Confidence: 0.356
Support: 21
Rule: If a person buys bread they will alse buy milk
Confidence: 0.519
Support: 14
Rule: If a person buys bread they will alse buy apples
Confidence: 0.185
Support: 5
Rule: If a person buys milk they will alse buy bread
Confidence: 0.304
Support: 14
Rule: If a person buys milk they will alse buy apples
Confidence: 0.196
Support: 9
Rule: If a person buys apples they will alse buy bread
Confidence: 0.139
Support: 5
Rule: If a person buys apples they will alse buy milk
Confidence: 0.250
Support: 9
Rule: If a person buys bread they will alse buy cheese
Confidence: 0.148
Support: 4
Rule: If a person buys cheese they will alse buy bread
Confidence: 0.098
Support: 4
Rule: If a person buys milk they will alse buy bananas
Confidence: 0.413
Support: 19
Rule: If a person buys bananas they will alse buy milk
Confidence: 0.322
Support: 19
Rule: If a person buys bread they will alse buy bananas
Confidence: 0.630
Support: 17
Rule: If a person buys bananas they will alse buy bread
Confidence: 0.288
Support: 17
Rule: If a person buys milk they will alse buy cheese
Confidence: 0.152
Support: 7
Rule: If a person buys cheese they will alse buy milk
Confidence: 0.171
Support: 7
def print_rule(premise, conclusion, support, confidence, features):
    """Print one association rule with its confidence and support.

    Args:
        premise: Index into ``features`` of the item on the "if" side.
        conclusion: Index into ``features`` of the item on the "then" side.
        support: Mapping from ``(premise, conclusion)`` to the rule's support.
        confidence: Mapping from ``(premise, conclusion)`` to the rule's
            confidence (formatted with three decimal places).
        features: Sequence of feature names, indexed by ``premise`` and
            ``conclusion``.
    """
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")
from operator import itemgetter
# Rank the rules from highest to lowest support. Each entry is a
# ((premise, conclusion), support) pair; ties keep their original order.
sorted_support = list(support.items())
sorted_support.sort(key=itemgetter(1), reverse=True)
查看支持度最高的前5条规则:
# Show the five best-ranked rules. Slicing (rather than indexing 0..4)
# avoids an IndexError when fewer than five rules were found.
for index, (rule, _count) in enumerate(sorted_support[:5]):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = rule
    print_rule(premise, conclusion, support, confidence, features)
Rule #1
Rule: If a person buys cheese they will also buy bananas
Confidence: 0.659
Support: 27
Rule #2
Rule: If a person buys bananas they will also buy cheese
Confidence: 0.458
Support: 27
Rule #3
Rule: If a person buys cheese they will also buy apples
Confidence: 0.610
Support: 25
Rule #4
Rule: If a person buys apples they will also buy cheese
Confidence: 0.694
Support: 25
Rule #5
Rule: If a person buys apples they will also buy bananas