if __name__ == "__main__":
    # Demo: build two small frames and walk through the datacompy.Compare
    # report attributes one by one, printing a label before each result.
    #
    # Both frames use the same three columns. Keys a=1 and a=3 appear in
    # both; column b differs for a=3 (4 vs 9); a=2 exists only in the first
    # frame and a=4 only in the second.
    shared_columns = ["a", "b", "c"]
    left_rows = [(1, 2, 5), (3, 4, 5), (2, 3, 4)]
    right_rows = [(1, 2, 5), (3, 9, 5), (4, 3, 4)]

    df = pd.DataFrame(data=left_rows, columns=shared_columns)
    df2 = pd.DataFrame(data=right_rows, columns=shared_columns)

    print(df)
    print(df2)
    print("------------------")

    # join_columns acts like a unique key: the two frames are joined on
    # these columns.
    comparison = datacompy.Compare(df, df2, join_columns=["a"])

    # --- rows / columns present on one side only ---
    print("df1独有的行:")  # rows only in df1
    print(comparison.df1_unq_rows)
    print("df2独有的行:")  # rows only in df2
    print(comparison.df2_unq_rows)
    print("df1独有的列:")  # columns only in df1
    print(comparison.df1_unq_columns())
    print("df2独有的列:")  # columns only in df2
    print(comparison.df2_unq_columns())

    # --- overall match checks ---
    print("不匹配的行:")  # mismatching rows
    print(comparison.all_mismatch())
    print("列名是否匹配")  # do the column names match?
    print(comparison.all_columns_match())
    print("是否匹配")  # do the frames match?
    print(comparison.matches())
    print("两个表的行是否全部一一对应")  # do all rows pair up one-to-one?
    # A row is identified by its join_columns values.
    print(comparison.all_rows_overlap())

    # --- shared columns / rows ---
    print("共有列")  # columns in both frames
    print(comparison.intersect_columns())
    print("共有行")  # rows in both frames
    print(comparison.intersect_rows)
    print("共有行是否匹配")  # do the shared rows match?
    print(comparison.intersect_rows_match())

    # --- per-column details ---
    print("列比较信息")  # per-column comparison stats
    print(comparison.column_stats)
    print("某一列差异")  # differences for a single column
    # NOTE(review): the original author remarked that this call "seems to
    # have a bug" -- unverified; confirm against the datacompy version in use.
    print(comparison.sample_mismatch("b"))
执行结果:
a b c
0 1 2 5
1 3 4 5
2 2 3 4
a b c
0 1 2 5
1 3 9 5
2 4 3 4
------------------
df1独有的行:
a b c
2 2 3.0 4.0
df2独有的行:
a b c
3 4 3.0 4.0
df1独有的列:
set()
df2独有的列:
set()
不匹配的行:
a b_df1 b_df2 c_df1 c_df2
1 3 4.0 9.0 5.0 5.0
列名是否匹配
False
两个表的行是否全部一一对应
False
{'b', 'a', 'c'}
a b_df1 c_df1 b_df2 c_df2 _merge b_match c_match
0 1 2.0 5.0 2.0 5.0 both True True
1 3 4.0 5.0 9.0 5.0 both False True
共有行是否匹配
False
列比较信息
[{'unequal_cnt': 1, 'all_match': False, 'null_diff': 0, 'dtype1': 'int64', 'match_cnt': 1, 'max_diff': 5.0, 'dtype2': 'int64', 'match_column': 'b_match', 'column': 'b'}, {'unequal_cnt': 0, 'all_match': True, 'null_diff': 0, 'dtype1': 'int64', 'match_cnt': 2, 'max_diff': 0, 'dtype2': 'int64', 'match_column': '', 'column': 'a'}, {'unequal_cnt': 0, 'all_match': True, 'null_diff': 0, 'dtype1': 'int64', 'match_cnt': 2, 'max_diff': 0.0, 'dtype2': 'int64', 'match_column': 'c_match', 'column': 'c'}]
某一列差异
a b_df1 b_df2
1 3 4.0 9.0