# 粗糙集相对属性约简python代码实现

1. 为了降低计算量，将一个大的数据集切割为多个小的数据集分别计算，可以使计算时间大大降低。（未分割时，2 万行数据在本人的笔记本上跑了 9 小时；切割后计算时间为 2–3 分钟。原因是中间的循环嵌套太多。）
2. 为了使结果更有代表性，将一个大的数据集切割为多个小的数据集，再把各个小数据集的计算结果相加得到总体结果，可以在一定程度上降低干扰。

```python
# _*_coding:utf-8 _*_
#@Time    :2018/11/28 下午7:21
#@Author  :we2swing
#@FileName: Rough_Set.py
# -*- coding:UTF-8 -*-

import pandas as pd
import time

def basic_set(df):
    """Partition the rows of ``df`` into groups of identical rows.

    Args:
        df: A pandas DataFrame or Series. Each element of
            ``df.values.tolist()`` is treated as one row (a list of cell
            values for a DataFrame, a scalar for a Series).

    Returns:
        A dict mapping ``str(row)`` to the ascending list of 0-based row
        positions (not index labels) at which that row occurs. Keys are
        inserted in order of first occurrence.
    """
    basic = {}
    # Single pass over the rows: grouping by the string form of each row
    # avoids rescanning (and re-materializing) the whole table once per
    # distinct row, which made the previous nested loop quadratic.
    for position, row in enumerate(df.values.tolist()):
        basic.setdefault(str(row), []).append(position)
    return basic

def rough_set(data):
    """Compute the importance of every condition attribute of a decision table.

    The column named ``'judge'`` is the decision attribute; every other
    column is a condition attribute. Rows containing any missing value are
    dropped first. The importance of an attribute is the drop in the
    dependency degree ``|POS| / |U|`` caused by removing that attribute,
    rounded to 4 decimal places.

    The positive region, both partitions and the sorted importance list are
    printed as a side effect.

    Args:
        data: pandas DataFrame containing a ``'judge'`` column.

    Returns:
        dict mapping each condition column name to its importance. When no
        complete row remains, every importance is ``0.0``.

    Raises:
        KeyError: if ``data`` has no ``'judge'`` column.
    """
    data = data.dropna(axis=0, how='any')
    x_data = data.drop(['judge'], axis=1)
    y_data = data.loc[:, 'judge']
    attributes = list(x_data.columns)
    n_rows = len(data)
    if n_rows == 0:
        # No complete rows: the dependency degree is undefined (0 / 0), so
        # report "no importance" instead of raising ZeroDivisionError.
        return {name: 0.0 for name in attributes}

    # Equivalence classes induced by the decision attribute.
    y_basic_set = sorted(basic_set(y_data).values())
    # Equivalence classes induced by all condition attributes.
    x_basic_set = sorted(basic_set(x_data).values())
    pos = _positive_region(x_basic_set, y_basic_set)
    print("x_basic_set",x_basic_set)
    print("y_basic_set",y_basic_set)
    print ('y的x正域Pos_x(y): ',pos)
    # Dependency degree of the decision on the full condition set.
    r_x_y = sum(len(rows) for rows in pos) / n_rows

    positions = list(range(len(attributes)))
    dict_imp = {}
    for i, name in enumerate(attributes):
        remaining = positions[:i] + positions[i + 1:]
        # Index into x_data (not data) so the result does not depend on
        # where the 'judge' column sits in the table.
        reduced = sorted(basic_set(x_data.iloc[:, remaining]).values())
        pos_va = _positive_region(reduced, y_basic_set)
        # Count objects, not classes, so this is measured in the same unit
        # as r_x_y; otherwise merging classes looks like lost dependency.
        r = sum(len(rows) for rows in pos_va) / n_rows
        dict_imp[name] = round(r_x_y - r, 4)

    ranked = sorted(dict_imp, key=dict_imp.get, reverse=True)
    sorted_dict_imp = [{name: dict_imp[name]} for name in ranked]
    print('属性重要度为:', sorted_dict_imp)

    return dict_imp


def _positive_region(condition_classes, decision_classes):
    """Return, sorted, the condition classes contained in one decision class."""
    decision_sets = [set(rows) for rows in decision_classes]
    region = []
    for rows in condition_classes:
        members = set(rows)
        if any(members.issubset(decision) for decision in decision_sets):
            region.append(rows)
    region.sort()
    return region

def deal(data, chunk_size=500):
    """Run ``rough_set`` on consecutive row chunks and print summed importances.

    ``data`` is cut into slices of ``chunk_size`` rows (the last slice may be
    shorter). ``rough_set`` is called on every slice, and the importance
    values it returns are added up per attribute name. A banner is printed
    before each slice and the totals, rounded to 4 decimal places, are
    printed at the end in the order the attribute names were first returned.

    Attribute names are taken from the ``rough_set`` results rather than a
    fixed C1..C27 list, so tables with any number of condition columns work.

    Args:
        data: pandas DataFrame holding the full decision table.
        chunk_size: Maximum number of rows per slice; defaults to 500.

    Returns:
        None. Results are printed. An empty table prints nothing.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    n_rows = len(data)
    total = {}
    for number, start in enumerate(range(0, n_rows, chunk_size), start=1):
        print('-------------------------------------第%d个数据集数据-----------------------------------------'%number)
        # iloc clips the end of the slice, so the last chunk may be short.
        result = rough_set(data.iloc[start:start + chunk_size])
        for name, value in result.items():
            if name in total:
                total[name] = value + total[name]
            else:
                total[name] = value

    # Print the accumulated importance of every attribute.
    for name, value in total.items():
        print(name,':',round(value,4))

def main(path='data1.csv'):
    """Load the decision table, run ``deal`` on it and print the elapsed seconds.

    Args:
        path: CSV file to load with ``pandas.read_csv``.
    """
    time1 = time.time()
    # Read the input table. The original referenced an undefined ``data``.
    # NOTE(review): the default file name comes from the note at the end of
    # the article; confirm the path and encoding for your data.
    data = pd.read_csv(path)
    deal(data)
    time2 = time.time()
    print(time2-time1)


if __name__ == '__main__':
    main()
``````

rough_set 是属性约简函数，想了解该函数原理的读者可以参考我转载的一篇文章（传送门）。
judge 列为决策属性列，C1–C27 为条件属性列。

data1.csv