patterns = pyfpgrowth.find_frequent_patterns(transactions, 2) # filter by support count: keep itemsets occurring >= 2 times
rules = pyfpgrowth.generate_association_rules(patterns, 0.6) # filter by confidence (conditional probability) >= 0.6
print(patterns)
print('===============')
print(rules)
{频繁项:频数}
{(5,): 2, (1, 5): 2, (2, 5): 2, (1, 2, 5): 2, (4,): 2, (2, 4): 2, (1,): 6, (1, 2): 4, (2, 3): 4, (1, 2, 3): 2, (1, 3): 4, (2,): 7}
===============
{关联组合:置信度_即条件概率}
{(5,): ((1, 2), 1.0), (1, 5): ((2,), 1.0), (2, 5): ((1,), 1.0), (4,): ((2,), 1.0), (1,): ((3,), 0.6666666666666666)}
pyfpgrowth包源码:
https://github.com/evandempsey/fp-growth/blob/master/pyfpgrowth/pyfpgrowth.py
import itertools
class FPNode(object):
    """A node in the FP tree."""

    def __init__(self, value, count, parent):
        """Create the node.

        value: the item this node represents (None for the tree root).
        count: number of transactions sharing the path to this node.
        parent: parent FPNode, or None for the root.
        """
        self.value = value
        self.count = count
        self.parent = parent
        # Next node carrying the same item (header-table node-link chain);
        # filled in by the tree as nodes are inserted.
        self.link = None
        self.children = []

    def has_child(self, value):
        """Check if node has a child holding the given item value."""
        return any(node.value == value for node in self.children)

    def get_child(self, value):
        """Return the child node with the given item value, or None."""
        for node in self.children:
            if node.value == value:
                return node
        return None

    def add_child(self, value):
        """Add a new child node for `value` with count 1 and return it."""
        child = FPNode(value, 1, self)
        self.children.append(child)
        return child
class FPTree(object):
    """A frequent pattern tree."""

    def __init__(self, transactions, threshold, root_value, root_count):
        """Initialize the tree.

        transactions: iterable of item lists.
        threshold: minimum support count for an item to be kept.
        root_value / root_count: item and support of the suffix when this is
        a conditional (projected) tree; both None for the main tree.
        """
        self.frequent = self.find_frequent_items(transactions, threshold)
        self.headers = self.build_header_table(self.frequent)
        self.root = self.build_fptree(
            transactions, root_value,
            root_count, self.frequent, self.headers)

    @staticmethod
    def find_frequent_items(transactions, threshold):
        """Create a dictionary of items with occurrences above the threshold."""
        items = {}

        for transaction in transactions:
            for item in transaction:
                if item in items:
                    items[item] += 1
                else:
                    items[item] = 1

        # Drop infrequent items; iterate over a list copy so we can delete
        # keys while looping.
        for key in list(items.keys()):
            if items[key] < threshold:
                del items[key]

        return items

    @staticmethod
    def build_header_table(frequent):
        """Build the header table: item -> head of its node-link chain."""
        headers = {}
        for key in frequent.keys():
            headers[key] = None
        return headers

    def build_fptree(self, transactions, root_value,
                     root_count, frequent, headers):
        """Build the FP tree and return the root node."""
        root = FPNode(root_value, root_count, None)

        for transaction in transactions:
            # Keep only frequent items, ordered by descending support so
            # shared prefixes compress into shared tree paths.
            sorted_items = [x for x in transaction if x in frequent]
            sorted_items.sort(key=lambda x: frequent[x], reverse=True)
            if len(sorted_items) > 0:
                self.insert_tree(sorted_items, root, headers)

        return root

    def insert_tree(self, items, node, headers):
        """Recursively grow FP tree."""
        first = items[0]
        child = node.get_child(first)
        if child is not None:
            child.count += 1
        else:
            # Add new child.
            child = node.add_child(first)

            # Link it to header structure (append at the end of the chain).
            if headers[first] is None:
                headers[first] = child
            else:
                current = headers[first]
                while current.link is not None:
                    current = current.link
                current.link = child

        # Call function recursively on the rest of the transaction.
        remaining_items = items[1:]
        if len(remaining_items) > 0:
            self.insert_tree(remaining_items, child, headers)

    def tree_has_single_path(self, node):
        """If there is a single path in the tree,
        return True, else return False.
        """
        num_children = len(node.children)
        if num_children > 1:
            return False
        elif num_children == 0:
            return True
        else:
            return True and self.tree_has_single_path(node.children[0])

    def mine_patterns(self, threshold):
        """Mine the constructed FP tree for frequent patterns."""
        if self.tree_has_single_path(self.root):
            # Single path: every combination of its items is frequent.
            return self.generate_pattern_list()
        else:
            return self.zip_patterns(self.mine_sub_trees(threshold))

    def zip_patterns(self, patterns):
        """Append suffix to patterns in dictionary if
        we are in a conditional FP tree.
        """
        suffix = self.root.value

        if suffix is not None:
            # We are in a conditional tree.
            new_patterns = {}
            for key in patterns.keys():
                new_patterns[tuple(sorted(list(key) + [suffix]))] = patterns[key]
            return new_patterns

        return patterns

    def generate_pattern_list(self):
        """Generate a list of patterns with support counts."""
        patterns = {}
        items = self.frequent.keys()

        # If we are in a conditional tree,
        # the suffix is a pattern on its own.
        if self.root.value is None:
            suffix_value = []
        else:
            suffix_value = [self.root.value]
            patterns[tuple(suffix_value)] = self.root.count

        for i in range(1, len(items) + 1):
            for subset in itertools.combinations(items, i):
                pattern = tuple(sorted(list(subset) + suffix_value))
                # Support of a combination on a single path is the minimum
                # support among its items.
                patterns[pattern] = \
                    min([self.frequent[x] for x in subset])

        return patterns

    def mine_sub_trees(self, threshold):
        """Generate subtrees and mine them for patterns."""
        patterns = {}
        mining_order = sorted(self.frequent.keys(),
                              key=lambda x: self.frequent[x])

        # Get items in tree in reverse order of occurrences.
        for item in mining_order:
            suffixes = []
            conditional_tree_input = []
            node = self.headers[item]

            # Follow node links to get a list of
            # all occurrences of a certain item.
            while node is not None:
                suffixes.append(node)
                node = node.link

            # For each occurrence of the item,
            # trace the path back to the root node.
            for suffix in suffixes:
                frequency = suffix.count
                path = []
                parent = suffix.parent

                while parent.parent is not None:
                    path.append(parent.value)
                    parent = parent.parent

                for i in range(frequency):
                    conditional_tree_input.append(path)

            # Now we have the input for a subtree,
            # so construct it and grab the patterns.
            subtree = FPTree(conditional_tree_input, threshold,
                             item, self.frequent[item])
            subtree_patterns = subtree.mine_patterns(threshold)

            # Insert subtree patterns into main patterns dictionary.
            for pattern in subtree_patterns.keys():
                if pattern in patterns:
                    patterns[pattern] += subtree_patterns[pattern]
                else:
                    patterns[pattern] = subtree_patterns[pattern]

        return patterns
def find_frequent_patterns(transactions, support_threshold):
    """Given a set of transactions, find the patterns in it
    over the specified support threshold.

    transactions: iterable of item lists.
    support_threshold: minimum support count (an absolute count, not a ratio).
    Returns a dict mapping frequent itemset (sorted tuple) -> support count.
    """
    tree = FPTree(transactions, support_threshold, None, None)
    return tree.mine_patterns(support_threshold)
def generate_association_rules(patterns, confidence_threshold):
    """Given a set of frequent itemsets, return a dict
    of association rules in the form
    {(left): ((right), confidence)}

    patterns: dict of frequent itemset (tuple) -> support count.
    confidence_threshold: minimum confidence for a rule to be kept.
    """
    rules = {}
    for itemset in patterns.keys():
        upper_support = patterns[itemset]

        # Try every proper non-empty subset of the itemset as an antecedent.
        for i in range(1, len(itemset)):
            for antecedent in itertools.combinations(itemset, i):
                antecedent = tuple(sorted(antecedent))
                consequent = tuple(sorted(set(itemset) - set(antecedent)))

                if antecedent in patterns:
                    lower_support = patterns[antecedent]
                    # confidence = support(itemset) / support(antecedent)
                    confidence = float(upper_support) / lower_support

                    if confidence >= confidence_threshold:
                        # NOTE: keyed by antecedent, so a later qualifying rule
                        # with the same antecedent overwrites an earlier one
                        # (upstream pyfpgrowth behavior, preserved here).
                        rules[antecedent] = (consequent, confidence)

    return rules
关联规则学习概述
在大型数据库中发现变量之间有趣关系的方法,目的是利用一些有趣的度量识别数据库中的强规则。基于强规则的概念,Rakesh Agrawal等人引入了关联规则以发现由超市的POS系统记录的大批交易数据中产品之间的规律性。例如,从销售数据中发现的规则{薄饼,鸡蛋}->{火腿肠},表明如果顾客一起买薄饼和鸡蛋,他们也有可能买火腿肠(这些顾客是想早饭吃手抓饼吧,哈哈),此类信息可以为大卖场商品组合促销或电商网站产品推荐等相关商业决策提供依据。除了上述的购物篮分析外,关联规则还被应用在许多领域中,
本文为博主原创文章,转载请注明出处,并附上原文链接。
原文链接:
FpGrowth算法,全称:Frequent Pattern Growth——频繁模式增长,该算法是Apriori算法的改进版本(若是不会这个算法或是有点遗忘了,可以移步到我的这篇博客),我们知道Apriori算法为了产生频繁模式项集,需要对数据库多次扫描,当数据库内容太大,那么算法运行的时间是难以忍受的,因此有人提出了FpGrowth算法,只须扫描数据库两次即可求出频繁项集,大
您可以使用pip安装该软件包:
pip install pyfpgrowth
然后,要在项目中使用它,请将其导入并使用find_frequent_patterns和generate_association_rules函数:
import pyfpgrowth
假定您的交易是表示购物篮中项目的一系列序列。 商品ID是整数:
transactions = [[1, 2, 5],
[2, 4],
[2, 3],
[1, 2, 4],
[1, 3],
[2, 3],
[1, 3],
在上篇文章频繁项集挖掘实战和关联规则产生中我们实现了Apriori的购物篮实战和由频繁项集产生关联规则, 本文沿《数据挖掘概念与技术》的主线继续学习FP-growth。因《数据挖掘概念与技术》中FP-growth内容过于琐碎且不易理解,我们的内容主要参考了《机器学习实战》第12章的内容。本文是对书中内容的通俗理解和代码实现,更详细的理论知识请参考书中内容, 本文涉及的完整jupyter代码和《机器学习实战》全书配套代码包, 可以在我们的 “数据臭皮匠” 中输入"第六章3" 拿到
Apriori算法是经典算法。关联规则(Association Rules)是反映一个事物与其他事物之间的相互依存性和关联性,是数据挖掘的一个重要技术,用于从大量数据中挖掘出有价值的数据项之间的相关关系。
常见的购物篮分析
该过程通过发现顾客放入其购物篮中的不同商品之间的联系,分析顾客的购买习惯。通过了解哪些商品频繁地被顾客同时购买,这种关联的发现可以帮助零售商制定营销策略。其他的应用还包括价目表设计、商品促销、商品的排放和基于购买模式的顾...
def __init__(self,nameValue,numOccur,parentNode):
    # Fragment of the treeNode constructor quoted from "Machine Learning in
    # Action" ch. 12. NOTE(review): presumably truncated here — the full
    # version likely also stores parentNode and a children dict; the
    # parentNode parameter is unused in the visible lines. TODO confirm.
    self.name = nameValue
    self.count = numOccur
    self.nodeLink = None
亲和性分析根据样本个体之间的关系,确定它们关系的亲疏。它主要根据两个指标统计商品之间的亲和性:
支持度:支持度指的是数据集中规则应验的次数。(商品交易中同时买A商品和B商品的交易数量【支持度也可以为次数/交易总量】)
置信度:置信度代表的是规则的准确性如何。(既买A商品又买B商品的数量除以买A商品的数量)
首先,我们使用Pandas读取数据集:
import pandas as pd
from itertools import combinations
features = ["bre
FP-growth算法在python中的实现,代码亲测可用,如果有
类似:'ascii' codec can't decode byte 0xe8 in position 0 的报错,请修改
fpgrowth.py中的CreatFPtree中的下面两种:
orderedItem = [v[0] for v in sorted(localD.iteritems(), key=lambda p:(p[1], -ord(p[0])), reverse=True)]
# orderedItem = [v[0] for v in sorted(localD.iteritems(), key=lambda p:(p[1], int(p[0])), reverse=True)]
FP-growth算法基于Apriori构建,但是采取高级的数据结构减少扫描次数,它只需要对数据库进行两次扫描,而Apriori算法对于每个潜在的频繁项集都会扫描数据集判定给定模式是否频繁,因此
Fp-growth算法速度比起Apriori算法要快很多。
Fp-growth算法挖掘频繁项集的基本过程如下:
(1)构建FP树
(2)从FP树中挖掘频繁项集
FP-growth算法将数据存储在一种称为FP树的紧凑数据结构中,FP代表频繁模式。图11-5给出一个FP树的例子