决策树对西瓜数据集2.0二分类

西瓜数据集.jpg

@生成分类字典

# -*- coding: UTF-8 -*- 

#设置默认编码，否则中文会乱码
import sys 
reload(sys) 
sys.setdefaultencoding('utf-8') 
from math import log

#1、获取样例集和属性列表
def filetodataset(filename):
    """Load a comma-separated sample file into a dataset and attribute list.

    The first line of the file is a header; its last field is dropped from
    the returned attribute names.  Every following non-blank line becomes
    one sample.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(dataset, featname)`` tuple.  ``dataset`` is a list of samples,
        each a list of the comma-separated string fields of one line
        (all columns kept).  ``featname`` is the list of header fields
        without the last one.

    Raises:
        IndexError: if the file is empty (there is no header line).
    """
    # The context manager closes the file even if reading fails.
    with open(filename, 'r') as fr:
        all_lines = fr.readlines()
    featname = all_lines[0].strip().split(',')[:-1]
    # Skip blank lines (e.g. a trailing newline at the end of the file);
    # they would otherwise turn into bogus one-field samples.
    dataset = [
        line.strip().split(',')
        for line in all_lines[1:]
        if line.strip()
    ]
    return dataset, featname

#2、计算香农熵
def calcent(dataset):
    """Return the base-2 Shannon entropy of the class labels in ``dataset``.

    The class label of a sample is its last element.  An empty dataset
    yields ``0``.
    """
    # Tally how many samples carry each class label.
    tally = {}
    for row in dataset:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1

    total = len(dataset)
    entropy = 0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)
    return entropy

#3、对特定属性选择特定取值后，将满足该条件的剩余数据集组合留待计算香农熵
def splitdataset(dataset, axis, value):
    """Select the samples whose column ``axis`` equals ``value``.

    Each selected sample is returned as a new list with column ``axis``
    removed; ``dataset`` and its samples are left unchanged.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataset
        if row[axis] == value
    ]

#4、选择最佳的划分属性
def choosebestfeaturetosplit(dataset):
    """Return the column index with the largest information gain.

    The last element of every sample is its class label; every other
    column is a candidate attribute.  The gain of a column is the entropy
    of the whole dataset minus the size-weighted entropy of the subsets
    obtained by splitting on each distinct value of that column.

    Returns:
        The index of the best column, or ``-1`` when no column has a gain
        strictly greater than zero.
    """
    total = float(len(dataset))
    baseshannon = calcent(dataset)
    bestfeature = -1
    bestinfogain = 0.0
    # The last column holds the class label, so it is not a candidate.
    for col in range(len(dataset[0]) - 1):
        attrshannon = 0
        for value in set(sample[col] for sample in dataset):
            subset = splitdataset(dataset, col, value)
            weight = len(subset) / total
            attrshannon += weight * calcent(subset)
        infogain = baseshannon - attrshannon
        if infogain > bestinfogain:
            bestfeature, bestinfogain = col, infogain
    return bestfeature


#5、返回样例中类最多的那个类别
def majorclass(data):
    """Return the class label that occurs most often in ``data``.

    The class label of a sample is its last element.

    Args:
        data: list of samples (sequences).

    Returns:
        The most frequent label; on a tie, the label that appears first
        in ``data``.  ``None`` when ``data`` is empty.
    """
    counts = {}
    first_seen = []  # labels in order of first appearance, for tie-breaking
    for sample in data:
        label = sample[-1]
        if label not in counts:
            counts[label] = 0
            first_seen.append(label)
        counts[label] += 1
    if not first_seen:
        return None
    # max() keeps the first maximal element, so ties go to the earliest label.
    return max(first_seen, key=counts.get)


#6、生成决策树
def createtree(mydata, labels):
    """Recursively build a decision tree as nested dicts.

    Args:
        mydata: non-empty list of samples; the last element of each sample
            is its class label, the other elements are attribute values.
        labels: attribute names, one per attribute column of ``mydata``.
            The list is not modified.

    Returns:
        A leaf (a class label, or whatever ``majorclass`` returns), or a
        dict of the form ``{attribute_name: {attribute_value: subtree}}``.
    """
    # Case 1: every sample has the same class -> that class is the leaf.
    classes = set(sample[-1] for sample in mydata)
    if len(classes) == 1:
        return classes.pop()

    # Case 2: only the class column is left -> fall back to the majority.
    if len(mydata[0]) == 1:
        return majorclass(mydata)

    bestfeature = choosebestfeaturetosplit(mydata)
    # Case 3: no attribute improves on the current entropy (-1 sentinel).
    # Indexing with -1 would split on the class column and corrupt the
    # tree, so stop here with the majority class instead.
    if bestfeature < 0:
        return majorclass(mydata)

    # Case 4: split on the best attribute and recurse into each value.
    bestfeaturelabel = labels[bestfeature]
    # Build the reduced label list without touching the caller's list.
    sublabels = labels[:bestfeature] + labels[bestfeature + 1:]
    mytree = {bestfeaturelabel: {}}
    for value in set(sample[bestfeature] for sample in mydata):
        subset = splitdataset(mydata, bestfeature, value)
        mytree[bestfeaturelabel][value] = createtree(subset, sublabels)
    return mytree


if __name__ == '__main__':
    import json
    # Dataset path: the first command-line argument when given, otherwise
    # the original hard-coded location.
    default_path = '/Users/enniu/Desktop/jqxx/xiguaset.txt'
    filename = sys.argv[1] if len(sys.argv) > 1 else default_path
    mydata, featname = filetodataset(filename)
    mytree = createtree(mydata, featname)
    # ensure_ascii=False keeps the Chinese text readable instead of
    # emitting \uXXXX escapes.  The single-argument print(...) form is
    # valid on both Python 2 and Python 3.
    print(json.dumps(mytree, ensure_ascii=False))

结果如下：

{"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}
说明
1、在同一条从根结点到叶结点的路径上，每个属性只会出现一次，因为划分之后该属性会从数据集中剔除；但同一个属性可以出现在不同的分支上

2、与《机器学习》（西瓜书）P78 的决策树相比，这里少了“色泽=浅白 → 好瓜”这一分支：在该结点处的训练子集中没有色泽为浅白的样本，而本代码只对子集中实际出现的属性取值生成分支；书中的算法则会为空分支生成叶结点，并标记为父结点中样本最多的类别

参考：
如何实现并应用决策树算法？
python 字典中有中文写入文件后变成编码

@绘制树形图


# -*- coding:utf-8 -*-

import sys 
reload(sys) 
sys.setdefaultencoding('utf-8')
import matplotlib.pyplot as plt
import json
#mytree={"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}
# Small sample tree passed to createplot() at the bottom of this script.
# Each tree dict maps one node label to a dict of {branch value: child},
# where a child is either another tree dict or a leaf label.
anothertree={'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
#anothertree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}
#print json.dumps(mytree,ensure_ascii=False)

#计算叶节点数目
def calculateleaf(mytree):
    """Count the leaves of a decision tree stored as nested dicts.

    Args:
        mytree: dict of the form ``{node_label: {branch_value: child}}``
            where each child is either another such dict or a leaf (any
            non-dict value).

    Returns:
        The number of leaves below the root of ``mytree``.
    """
    # The tree dict's first key is the root label.  next(iter(...)) works
    # on Python 2 and 3, whereas keys()[0] fails on Python 3.
    firststr = next(iter(mytree))
    numleaf = 0
    for child in mytree[firststr].values():
        # isinstance also recognises dict subclasses such as OrderedDict.
        if isinstance(child, dict):
            numleaf += calculateleaf(child)
        else:
            numleaf += 1
    return numleaf

#计算树的层数
def calculatedepth(mytree):
    """Return the depth of a decision tree stored as nested dicts.

    Args:
        mytree: dict of the form ``{node_label: {branch_value: child}}``
            where each child is either another such dict or a leaf (any
            non-dict value).

    Returns:
        The number of branch levels on the longest path from the root to
        a leaf; a root whose children are all leaves has depth 1.
    """
    # The tree dict's first key is the root label.  next(iter(...)) works
    # on Python 2 and 3, whereas keys()[0] fails on Python 3.
    firststr = next(iter(mytree))
    maxdepth = 0
    for child in mytree[firststr].values():
        # isinstance also recognises dict subclasses such as OrderedDict.
        if isinstance(child, dict):
            numdepth = 1 + calculatedepth(child)
        else:
            numdepth = 1  # a leaf adds exactly one level
        if numdepth > maxdepth:
            maxdepth = numdepth
    return maxdepth

def plotmidtext(cntrpt, parentpt, txtstring):
    """Write ``txtstring`` at the midpoint between ``cntrpt`` and ``parentpt``.

    Both points are indexable ``(x, y)`` pairs.  The text is drawn on the
    axes stored in ``createplot.ax1``.
    """
    child_x, child_y = cntrpt[0], cntrpt[1]
    xmid = (parentpt[0] - child_x) / 2.0 + child_x
    ymid = (parentpt[1] - child_y) / 2.0 + child_y
    createplot.ax1.text(xmid, ymid, txtstring)

# Box style passed as ``bbox`` when plottree() draws a decision (inner) node.
decisionnode=dict(boxstyle="sawtooth",fc="0.8")
# Box style passed as ``bbox`` when plottree() draws a leaf node.
leafnode=dict(boxstyle="round4",fc="0.8")
# Arrow properties passed as ``arrowprops`` by plotnode().
arrow_args=dict(arrowstyle="<-")

def plotnode(nodetext, centerpt, parentpt, nodetype):
    """Draw one boxed node label with an arrow linking it to its parent.

    The annotation is added to ``createplot.ax1``: the text ``nodetext``
    is placed at ``centerpt`` inside a box styled by ``nodetype``, and the
    annotated point is ``parentpt``.  The arrow uses ``arrow_args``.
    """
    createplot.ax1.annotate(
        nodetext,
        xy=parentpt,
        xytext=centerpt,
        arrowprops=arrow_args,
        xycoords='axes fraction',
        va='center',
        ha='center',
        bbox=nodetype
    )


def plottree(mytree, parentpt, nodetxt):
    """Draw ``mytree`` below ``parentpt`` and recurse into its sub-trees.

    Layout state is kept in function attributes that ``createplot`` sets
    before the first call: ``plottree.totalw`` and ``plottree.totald``
    (leaf count and depth of the whole tree) and the drawing cursor
    ``plottree.xoff`` / ``plottree.yoff``.

    Args:
        mytree: dict of the form ``{node_label: {branch_value: child}}``.
        parentpt: ``(x, y)`` position of the parent node.
        nodetxt: text written on the edge between the parent and this node.
    """
    numleafs = calculateleaf(mytree)
    # next(iter(...)) works on Python 2 and 3, unlike keys()[0].
    firststr = next(iter(mytree))
    # Centre this node horizontally over the leaves it spans.
    cntrpt = (plottree.xoff + (1.0 + float(numleafs)) / 2.0 / plottree.totalw,
              plottree.yoff)
    # The trace prints use the single-argument print(...) form so they
    # produce the same text on Python 2 and Python 3.
    print('子节点坐标: %s' % (cntrpt,))
    plotmidtext(cntrpt, parentpt, nodetxt)
    plotnode(firststr, cntrpt, parentpt, decisionnode)
    print('绘制连接箭头 %s %s' % (cntrpt, parentpt))
    seconddict = mytree[firststr]
    # Remember this level's y so it can be restored exactly afterwards.
    level_y = plottree.yoff
    # Step one level down for the children.
    plottree.yoff = plottree.yoff - 1.0 / (1.0 * plottree.totald)
    print('y轴值: %s' % plottree.yoff)
    for key in seconddict.keys():
        child = seconddict[key]
        if isinstance(child, dict):
            print('***sandy*** %s' % plottree.xoff)
            plottree(child, cntrpt, str(key))
            print('***lam*** %s' % plottree.xoff)
        else:
            # Each leaf takes the next horizontal slot.
            plottree.xoff = plottree.xoff + 1.0 / plottree.totalw
            plotnode(child, (plottree.xoff, plottree.yoff), cntrpt, leafnode)
            print('灯灯hoho %s %s' % ((plottree.xoff, plottree.yoff), cntrpt))
            plotmidtext((plottree.xoff, plottree.yoff), cntrpt, str(key))
    # Restore the y cursor so that siblings drawn after this sub-tree stay
    # on their own level instead of one level too deep.
    plottree.yoff = level_y

def createplot(intree):
    """Draw the decision tree ``intree`` in matplotlib figure 1 and show it.

    Clears figure 1, stores a single subplot in ``createplot.ax1``,
    initialises the layout attributes on ``plottree`` and then draws the
    tree starting from the point ``(0.5, 1.0)``.
    """
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    ticks = [0, 0.2, 0.4, 0.6, 0.8, 1]
    createplot.ax1 = plt.subplot(111, frameon=True,
                                 xticks=ticks, yticks=list(ticks))

    # Overall tree size, used by plottree to scale node positions.
    leaf_total = float(calculateleaf(intree))
    plottree.totalw = leaf_total
    plottree.totald = float(calculatedepth(intree))
    # Start the cursor half a leaf slot left of the axes, at the top.
    plottree.xoff = -0.5 / leaf_total
    plottree.yoff = 1.0

    plottree(intree, (0.5, 1.0), '')
    plt.show()

if __name__=='__main__':
    # Draw the sample tree defined near the top of this script.
    createplot(anothertree)

@@递归探讨

当碰到递归时，沿着递归执行到最终结果（即最后停止递归的地方），然后再依次往上层执行

# -*- coding: UTF-8 -*- 
def calculatedepth(mytree):
    """Return the depth of a nested-dict tree, printing a trace of the recursion.

    Args:
        mytree: dict of the form ``{node_label: {branch_value: child}}``
            where each child is either another such dict or a leaf (any
            non-dict value).

    Returns:
        The number of branch levels on the longest path from the root to
        a leaf.
    """
    maxdepth = 0
    # next(iter(...)) works on Python 2 and 3, unlike keys()[0].
    firststr = next(iter(mytree))
    seconddict = mytree[firststr]
    # The trace prints use the single-argument print(...) form so they
    # produce the same text on Python 2 and Python 3.
    for key in seconddict.keys():
        print(key)
        if isinstance(seconddict[key], dict):
            print('**')
            numdepth = 1 + calculatedepth(seconddict[key])
            print('第1种情况 %s' % numdepth)
        else:
            numdepth = 1  # a leaf adds exactly one level
            print('第2种情况 %s' % numdepth)
        if numdepth > maxdepth:
            maxdepth = numdepth
        print((numdepth, maxdepth))
    return maxdepth

# Sample tree whose literal lists the nested sub-tree (key 1) before the leaf (key 0).
mytree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}

if __name__=='__main__':
    # Run the traced depth computation; the result is stored in ``a`` but not printed.
    a=calculatedepth(mytree)

隐形眼镜数据集.png

最后编辑于：2017.12.06 09:47:13

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 157,298评论 4赞 360
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 66,701评论 1赞 290
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 107,078评论 0赞 237
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 43,687评论 0赞 202
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 52,018评论 3赞 286
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 40,410评论 1赞 211
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 31,729评论 2赞 310
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 30,412评论 0赞 194
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 34,124评论 1赞 239
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 30,379评论 2赞 242
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 31,903评论 1赞 257
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 28,268评论 2赞 251
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 32,894评论 3赞 233
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 26,014评论 0赞 8
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 26,770评论 0赞 192
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 35,435评论 2赞 269
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 35,312评论 2赞 260

决策树对西瓜数据集2.0二分类

推荐阅读更多精彩内容