To let the program read the training data, we replace the textual descriptions of the samples with numeric codes:
| Color | Knock sound | Texture | Good melon |
| --- | --- | --- | --- |
| 0 | 0 | 0 | 1 |
| 1 | 1 | 0 | 1 |
| 1 | 0 | 0 | 1 |
| 0 | 1 | 0 | 1 |
| 2 | 0 | 0 | 1 |
| 1 | 1 | 1 | 0 |
| 0 | 2 | 0 | 0 |
| 2 | 2 | 1 | 0 |
| 2 | 0 | 1 | 0 |
| 0 | 0 | 1 | 0 |
The data is then packed into a training sample array and read into the program.
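A minimal sketch of how the table could be encoded for the code below; the array name train_x and the flat row-major layout are assumptions, not fixed by the text above:

//training data from the table above: 10 rows x 4 columns (Color, Knock sound, Texture, Good melon)
//the last column is the class label; name and layout are illustrative assumptions
int train_x[10 * 4] = {
    0, 0, 0, 1,
    1, 1, 0, 1,
    1, 0, 0, 1,
    0, 1, 0, 1,
    2, 0, 0, 1,
    1, 1, 1, 0,
    0, 2, 0, 0,
    2, 2, 1, 0,
    2, 0, 1, 0,
    0, 0, 1, 0
};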
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

//decision tree node
typedef struct s_dtree
{
    //sample subset belonging to this node
    int* sample_x;
    //number of samples in the subset
    int sample_c;
    //number of features per sample
    int sample_f;
    //index of the feature this node splits on
    int sample_fi;
    //value of that feature at this node
    int sample_v;
    //write index into the sample subset
    int sample_ind;
    //class label
    int sample_y;
    //child node
    struct s_dtree *child;
    //sibling node
    struct s_dtree *sibling;
} s_dtree;

//linked list used for counting value occurrences
typedef struct s_slist
{
    //key value
    int key;
    //number of times the key occurs
    int count;
    //next node
    struct s_slist* next;
} s_slist;

//root node of the decision tree
s_dtree *root = NULL;
//count an occurrence of a key (append the key if it is not in the list yet)
int slist_key_count(s_slist* list, int key)
{
    if (list == NULL)
    {
        return -1;
    }
    //head node
    if (list->key == key)
    {
        list->count++;
        return 0;
    }
    s_slist *p = list;
    //search for the key
    while (p->next != NULL)
    {
        //node with the same key
        if (p->next->key == key)
        {
            //increment its occurrence count
            p->next->count++;
            return 0;
        }
        p = p->next;
    }
    //key not found: append a new node
    s_slist *pnew = malloc(sizeof(s_slist));
    pnew->key = key;
    pnew->count = 1;
    pnew->next = NULL;
    p->next = pnew;
    return 0;
}
//get the number of occurrences of a key
int slist_get_count(s_slist *list, int key)
{
    s_slist *p = list;
    while (p != NULL)
    {
        if (p->key == key)
        {
            return p->count;
        }
        p = p->next;
    }
    return 0;
}
//release the list nodes
int slist_free(s_slist *list)
{
    s_slist *p = list;
    while (p != NULL)
    {
        s_slist *d = p;
        p = p->next;
        free(d);
    }
    return 0;
}
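As a quick usage sketch (not part of the original code; it assumes the train_x array shown after the table), counting the class labels of the 10 samples with these helpers could look like this:

//sketch: count how often each class label occurs in the last column of train_x
s_slist labels = { train_x[3], 1, NULL };      //head node holds the first label
for (int i = 1; i < 10; i++)
{
    slist_key_count(&labels, train_x[i * 4 + 3]);
}
//with the table above this should print good=5 bad=5
printf("good=%d bad=%d\n", slist_get_count(&labels, 1), slist_get_count(&labels, 0));
slist_free(labels.next);                       //only the appended nodes are heap-allocated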
//compute the Shannon entropy from a list of value counts
double dt_ent_value(s_slist *feature, int csample)
{
    if (feature == NULL)
    {
        return 0;
    }
    //probability of this feature value
    double px = (double) feature->count / (double) csample;
    //recurse over the remaining values, accumulating -p * log2(p)
    return -px * log2(px) + dt_ent_value(feature->next, csample);
}
//compute the Shannon entropy of every feature and return the index of the feature with the largest entropy
int dt_ent(s_slist *features, int* sample_x, int csample, int cfeature)
{
    //largest entropy found so far
    double max_ent = 0;
    //feature with the largest entropy
    int max_feature = 0;
    //compute the entropy of each feature
    for (int j = 0; j < cfeature; j++)
    {
        //count how often each value of the feature occurs
        for (int i = 0; i < csample; i++)
        {
            //list head
            if (i == 0)
            {
                //feature value
                features[j].key = sample_x[i * cfeature + j];
                //occurrence count
                features[j].count = 1;
                features[j].next = NULL;
                continue;
            }
            //count the occurrences of each feature value
            slist_key_count(&features[j], sample_x[i * cfeature + j]);
        }
        //Shannon entropy of this feature
        double ent = dt_ent_value(&features[j], csample);
        //keep track of the maximum
        if (ent > max_ent)
        {
            max_ent = ent;
            //remember the feature with the largest entropy
            max_feature = j;
        }
    }
    //return the feature with the largest entropy
    return max_feature;
}
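As a hedged sanity check of the feature selection (illustrative only; it assumes the train_x array shown after the table):

//sketch: choose the first split feature over the 10 x 4 table
s_slist *features = malloc(sizeof(s_slist) * 4);
int best = dt_ent(features, train_x, 10, 4);
//per-column value counts are 4/3/3 (Color), 5/3/2 (Knock sound), 6/4 (Texture) and 5/5 (label),
//i.e. entropies of roughly 1.571, 1.485, 0.971 and 1.000 bits, so best should be 0 (Color)
printf("first split feature: %d\n", best);
for (int j = 0; j < 4; j++)
{
    slist_free(features[j].next);   //only the chained nodes are heap-allocated
}
free(features);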
//build or update a node of the decision tree for one training sample
int dt_create_node(s_dtree *node, int *sample_x, int row, s_slist *features, int feature, int count_feature)
{
    if (node == NULL)
    {
        return -1;
    }
    //value of the split feature in the current sample
    int value = sample_x[row * count_feature + feature];
    //if it matches the current node's value
    if (node->sample_v == value)
    {
        //copy the sample (minus the split feature) into the node's sample subset
        for (int j = 0, k = 0; j < count_feature; j++)
        {
            if (j != feature)
            {
                node->sample_x[node->sample_ind * (count_feature - 1) + k] = sample_x[row * count_feature + j];
                k++;
            }
        }
        //advance the subset write index
        node->sample_ind++;
        return 0;
    }
    s_dtree *p = node;
    //search the node's siblings
    while (p->sibling != NULL)
    {
        //sibling with the same feature value
        if (p->sibling->sample_v == value)
        {
            //copy the sample (minus the split feature) into the sibling's sample subset
            for (int j = 0, k = 0; j < count_feature; j++)
            {
                if (j != feature)
                {
                    p->sibling->sample_x[p->sibling->sample_ind * (count_feature - 1) + k] = sample_x[row * count_feature + j];
                    k++;
                }
            }
            //advance the subset write index
            p->sibling->sample_ind++;
            return 0;
        }
        p = p->sibling;
    }
    //no node with this value yet: create a new one
    s_dtree *pnew = malloc(sizeof(s_dtree));
    //feature value of the new node
    pnew->sample_v = sample_x[row * count_feature + feature];
    //number of samples that share this value
    int s_count = slist_get_count(&features[feature], pnew->sample_v);
    //sample count
    pnew->sample_c = s_count;
    //subset write index
    pnew->sample_ind = 0;
    //sample subset
    pnew->sample_x = malloc(sizeof(int) * s_count * (count_feature - 1));
    for (int j = 0, k = 0; j < count_feature; j++)
    {
        if (j != feature)
        {
            pnew->sample_x[pnew->sample_ind * (count_feature - 1) + k] = sample_x[row * count_feature + j];
            k++;
        }
    }
    //advance the subset write index
    pnew->sample_ind++;
    //number of remaining features
    pnew->sample_f = count_feature - 1;
    //feature index this node was split on
    pnew->sample_fi = feature;
    //class label (not assigned yet)
    pnew->sample_y = -1;
    pnew->child = NULL;
    pnew->sibling = NULL;
    //link the new node in
    p->sibling = pnew;
    return 0;
}
//build the decision tree
s_dtree* dt_create(s_dtree *p_node, int* sample_x, int csample, int cfeature)
{
    if (sample_x == NULL)
    {
        return NULL;
    }
    //only one feature left: it is the last column, i.e. the class label
    if (cfeature <= 1)
    {
        //count how often each class label occurs
        s_slist *ps = malloc(sizeof(s_slist));
        for (int i = 0; i < csample; i++)
        {
            if (i == 0)
            {
                ps->key = sample_x[i * cfeature];
                ps->count = 1;
                ps->next = NULL;
                continue;
            }
            //count the occurrences of each label
            slist_key_count(ps, sample_x[i * cfeature]);
        }
        //find the most frequent label
        s_slist *p = ps;
        int max_count = 0;
        int max_y = 0;
        while (p != NULL)
        {
            if (p->count > max_count)
            {
                max_count = p->count;
                max_y = p->key;
            }
            p = p->next;
        }
        slist_free(ps);
        //assign the class label to the parent node
        p_node->sample_y = max_y;
        return NULL;
    }
    //node currently being filled
    s_dtree *node = NULL;
    //per-feature counting lists, used for sample counts and Shannon entropy
    s_slist *features = malloc(sizeof(s_slist) * cfeature);
    //select the split feature via Shannon entropy
    int feature = dt_ent(features, sample_x, csample, cfeature);
    //partition the samples by the values of the split feature; each node holds one data subset
    for (int i = 0; i < csample; i++)
    {
        if (i == 0)
        {
            //new node
            node = malloc(sizeof(s_dtree));
            //feature value of this node
            node->sample_v = sample_x[i * cfeature + feature];
            //size of the sample subset
            int s_count = slist_get_count(&features[feature], node->sample_v);
            //sample count
            node->sample_c = s_count;
            //subset write index
            node->sample_ind = 0;
            //sample subset
            node->sample_x = malloc(sizeof(int) * s_count * (cfeature - 1));
            //copy the sample (minus the split feature) into the subset
            for (int j = 0, k = 0; j < cfeature; j++)
            {
                if (j != feature)
                {
                    node->sample_x[node->sample_ind * (cfeature - 1) + k] = sample_x[i * cfeature + j];
                    k++;
                }
            }
            //advance the subset write index
            node->sample_ind++;
            //number of remaining features
            node->sample_f = cfeature - 1;
            //feature index this node was split on
            node->sample_fi = feature;
            //class label (not assigned yet)
            node->sample_y = -1;
            node->child = NULL;
            node->sibling = NULL;
            //no parent node
            if (p_node == NULL)
            {
                //this is the root
                root = node;
                continue;
            }
            //attach the new node to its parent
            p_node->child = node;
            continue;
        }
        //assign the remaining samples to nodes, creating new nodes as needed
        dt_create_node(node, sample_x, i, features, feature, cfeature);
    }
    //release the counting lists (only the chained nodes are heap-allocated; the heads live in the array)
    for (int j = 0; j < cfeature; j++)
    {
        slist_free(features[j].next);
    }
    free(features);
    //recursively process every node on this level
    s_dtree *p = node;
    while (p != NULL)
    {
        //recursively build the subtree
        dt_create(p, p->sample_x, p->sample_c, cfeature - 1);
        p = p->sibling;
    }
    return node;
}
//release the tree's memory
int dt_free(s_dtree *node)
{
    s_dtree *p = node;
    if (p == NULL)
    {
        return 0;
    }
    s_dtree *d = p;
    dt_free(p->sibling);
    dt_free(p->child);
    if (d->sample_x != NULL)
    {
        //release the sample subset
        free(d->sample_x);
    }
    //release the tree node itself
    free(d);
    return 0;
}
//classify a sample with the decision tree
int dt_classify(s_dtree *node, int *sample_x)
{
    if (node == NULL)
    {
        return -1;
    }
    //value of this node's split feature in the sample
    int v = sample_x[node->sample_fi];
    //bottom level: only one feature (the class label) is left
    if (node->sample_f == 1)
    {
        //if the sample's feature value matches this node's value
        if (node->sample_v == v)
        {
            //return the class label
            return node->sample_y;
        }
        //otherwise try the sibling nodes
        return dt_classify(node->sibling, sample_x);
    }
    //if the sample's feature value matches this node's value
    if (node->sample_v == v)
    {
        //reduced sample with the current split feature removed
        int sample_target[node->sample_f];
        //copy the remaining feature values
        for (int i = 0, k = 0; i < node->sample_f; i++)
        {
            //skip the split feature
            if (i != node->sample_fi)
            {
                sample_target[k] = sample_x[i];
                k++;
            }
        }
        //descend into the child nodes
        return dt_classify(node->child, sample_target);
    }
    //otherwise try the sibling nodes
    return dt_classify(node->sibling, sample_x);
}
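Finally, a hedged end-to-end sketch of how the pieces could be tied together; main, the test sample, and the reuse of the train_x array are illustrative assumptions rather than part of the original text:

int main(void)
{
    //build the tree from the 10 training samples and 4 columns (the last column is the label)
    dt_create(NULL, train_x, 10, 4);
    //classify a new melon: Color=1, Knock sound=0, Texture=0
    int test[3] = { 1, 0, 0 };
    //with the table above this should print 1 (good melon)
    printf("predicted class: %d\n", dt_classify(root, test));
    //release the tree
    dt_free(root);
    return 0;
}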