To let the program read the training samples, we replace the textual attribute values with numbers:
Color | Sound | Texture | Good melon
0 | 0 | 0 | 1
1 | 1 | 0 | 1
1 | 0 | 0 | 1
0 | 1 | 0 | 1
2 | 0 | 0 | 1
1 | 1 | 1 | 0
0 | 2 | 0 | 0
2 | 2 | 1 | 0
2 | 0 | 1 | 0
0 | 0 | 1 | 0
The numeric data is then assembled into a training set and read into the program.
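As a minimal sketch (the array name train and the flat, row-major layout are our own choices, not taken from the original), the table can be written directly as a C array, one row per sample:

//encoded training set from the table above: color, sound, texture, label (1 = good melon)
int train[] = {
    0, 0, 0, 1,
    1, 1, 0, 1,
    1, 0, 0, 1,
    0, 1, 0, 1,
    2, 0, 0, 1,
    1, 1, 1, 0,
    0, 2, 0, 0,
    2, 2, 1, 0,
    2, 0, 1, 0,
    0, 0, 1, 0
};

The code below builds the decision tree from such an array and uses it to classify new samples.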
//headers required for malloc/free and log2
#include <stdlib.h>
#include <math.h>

//decision-tree node
typedef struct s_dtree {
    int *sample_x;              //sub-sample held by this node
    int sample_c;               //number of samples in the sub-sample
    int sample_f;               //number of features in the sub-sample
    int sample_fi;              //index of the feature this node was split on
    int sample_v;               //value of that feature for this node
    int sample_ind;             //write index into the sub-sample
    int sample_y;               //class label
    struct s_dtree *child;      //first child node
    struct s_dtree *sibling;    //next sibling node
} s_dtree;

//singly linked list used as a value counter
typedef struct s_slist {
    int key;                    //key value
    int count;                  //number of times the key occurs
    struct s_slist *next;       //next node
} s_slist;

//root of the decision tree
s_dtree *root = NULL;

//count one occurrence of a key
int slist_key_count(s_slist *list, int key) {
    if (list == NULL) {
        return -1;
    }
    //head node
    if (list->key == key) {
        list->count++;
        return 0;
    }
    s_slist *p = list;
    //search for the key
    while (p->next != NULL) {
        //node with the same key
        if (p->next->key == key) {
            //increase its occurrence count
            p->next->count++;
            return 0;
        }
        p = p->next;
    }
    //key not found: create a new node
    s_slist *pnew = malloc(sizeof(s_slist));
    pnew->key = key;
    pnew->count = 1;
    pnew->next = NULL;
    p->next = pnew;
    return 0;
}

//get the occurrence count of a key
int slist_get_count(s_slist *list, int key) {
    s_slist *p = list;
    while (p != NULL) {
        if (p->key == key) {
            return p->count;
        }
        p = p->next;
    }
    return 0;
}

//release the list
int slist_free(s_slist *list) {
    s_slist *p = list;
    while (p != NULL) {
        s_slist *d = p;
        p = p->next;
        free(d);
    }
    return 0;
}

//compute the Shannon entropy of one feature
double dt_ent_value(s_slist *feature, int csample) {
    if (feature == NULL) {
        return 0;
    }
    //probability of this feature value
    double px = (double) feature->count / (double) csample;
    //accumulate -p * log2(p) recursively over all values
    return -px * log2(px) + dt_ent_value(feature->next, csample);
}

//compute the Shannon entropy of every feature and return the index of the feature with the largest entropy
int dt_ent(s_slist *features, int *sample_x, int csample, int cfeature) {
    //largest entropy seen so far
    double max_ent = 0;
    //feature with the largest entropy
    int max_feature = 0;
    //compute the entropy of each feature
    for (int j = 0; j < cfeature; j++) {
        //count how often each value of the feature occurs
        for (int i = 0; i < csample; i++) {
            //the first sample initializes the list head
            if (i == 0) {
                features[j].key = sample_x[i * cfeature + j];   //feature value
                features[j].count = 1;                          //occurrence count
                features[j].next = NULL;
                continue;
            }
            //count this feature value
            slist_key_count(&features[j], sample_x[i * cfeature + j]);
        }
        //Shannon entropy of the feature
        double ent = dt_ent_value(&features[j], csample);
        //keep the largest entropy
        if (ent > max_ent) {
            max_ent = ent;
            //remember the feature with the largest entropy
            max_feature = j;
        }
    }
    //return the feature with the largest entropy
    return max_feature;
}

//build one node of the decision tree
int dt_create_node(s_dtree *node, int *sample_x, int row, s_slist *features, int feature, int count_feature) {
    if (node == NULL) {
        return -1;
    }
    //value of the splitting feature in the current sample
    int value = sample_x[row * count_feature + feature];
    //same value as the current node
    if (node->sample_v == value) {
        //copy the sample into the node's sub-sample, dropping the splitting feature
        for (int j = 0, k = 0; j < count_feature; j++) {
            if (j != feature) {
                node->sample_x[node->sample_ind * (count_feature - 1) + k] = sample_x[row * count_feature + j];
                k++;
            }
        }
        //advance the write index
        node->sample_ind++;
        return 0;
    }
    s_dtree *p = node;
    //search the node's siblings
    while (p->sibling != NULL) {
        //sibling with the same value
        if (p->sibling->sample_v == value) {
            //copy the sample into the sibling's sub-sample, dropping the splitting feature
            for (int j = 0, k = 0; j < count_feature; j++) {
                if (j != feature) {
                    p->sibling->sample_x[p->sibling->sample_ind * (count_feature - 1) + k] = sample_x[row * count_feature + j];
                    k++;
                }
            }
            //advance the write index
            p->sibling->sample_ind++;
            return 0;
        }
        p = p->sibling;
    }
    //no node with this value found: create a new one
    s_dtree *pnew = malloc(sizeof(s_dtree));
    //feature value of the new node
    pnew->sample_v = sample_x[row * count_feature + feature];
    //number of samples sharing this value
    int s_count = slist_get_count(&features[feature], pnew->sample_v);
    //sample count
    pnew->sample_c = s_count;
    //write index
    pnew->sample_ind = 0;
    //sub-sample
    pnew->sample_x = malloc(sizeof(int) * s_count * (count_feature - 1));
    for (int j = 0, k = 0; j < count_feature; j++) {
        if (j != feature) {
            pnew->sample_x[pnew->sample_ind * (count_feature - 1) + k] = sample_x[row * count_feature + j];
            k++;
        }
    }
    //advance the write index
    pnew->sample_ind++;
    //number of remaining features
    pnew->sample_f = count_feature - 1;
    //splitting feature of this node
    pnew->sample_fi = feature;
    //class label (not assigned yet)
    pnew->sample_y = -1;
    pnew->child = NULL;
    pnew->sibling = NULL;
    //append the new node
    p->sibling = pnew;
    return 0;
}

//build the decision tree
s_dtree *dt_create(s_dtree *p_node, int *sample_x, int csample, int cfeature) {
    if (sample_x == NULL) {
        return NULL;
    }
    //only one feature column left: it is the class label
    if (cfeature <= 1) {
        //count how often each class label occurs
        s_slist *ps = malloc(sizeof(s_slist));
        for (int i = 0; i < csample; i++) {
            if (i == 0) {
                ps->key = sample_x[i * cfeature];
                ps->count = 1;
                ps->next = NULL;
                continue;
            }
            //count this class label
            slist_key_count(ps, sample_x[i * cfeature]);
        }
        //find the most frequent class
        s_slist *p = ps;
        int max_count = 0;
        int max_y = 0;
        while (p != NULL) {
            if (p->count > max_count) {
                max_count = p->count;
                max_y = p->key;
            }
            p = p->next;
        }
        slist_free(ps);
        //assign the majority class to the parent node
        p_node->sample_y = max_y;
        return NULL;
    }
    //head of the partition-node list for this split
    s_dtree *node = NULL;
    //per-feature value counters, used for sample counts and Shannon entropy
    s_slist *features = malloc(sizeof(s_slist) * cfeature);
    //choose the splitting feature by Shannon entropy
    int feature = dt_ent(features, sample_x, csample, cfeature);
    //partition the samples by the different values of the splitting feature; each node holds one partition
    for (int i = 0; i < csample; i++) {
        if (i == 0) {
            //new node
            node = malloc(sizeof(s_dtree));
            //feature value of the node
            node->sample_v = sample_x[i * cfeature + feature];
            //size of the sub-sample (samples sharing this value)
            int s_count = slist_get_count(&features[feature], node->sample_v);
            node->sample_c = s_count;
            //write index into the sub-sample
            node->sample_ind = 0;
            //sub-sample
            node->sample_x = malloc(sizeof(int) * s_count * (cfeature - 1));
            //copy the sample into the sub-sample, dropping the splitting feature
            for (int j = 0, k = 0; j < cfeature; j++) {
                if (j != feature) {
                    node->sample_x[node->sample_ind * (cfeature - 1) + k] = sample_x[i * cfeature + j];
                    k++;
                }
            }
            //advance the write index
            node->sample_ind++;
            //number of remaining features
            node->sample_f = cfeature - 1;
            //splitting feature of this node
            node->sample_fi = feature;
            //class label (not assigned yet)
            node->sample_y = -1;
            node->child = NULL;
            node->sibling = NULL;
            //no parent node
            if (p_node == NULL) {
                //this node is the root
                root = node;
                continue;
            }
            //attach the new node to its parent
            p_node->child = node;
            continue;
        }
        //assign the remaining samples to partitions, creating sibling nodes as needed
        dt_create_node(node, sample_x, i, features, feature, cfeature);
    }
    //release the counters: free the chained value nodes of each feature, then the array itself
    for (int j = 0; j < cfeature; j++) {
        slist_free(features[j].next);
    }
    free(features);
    //process every partition recursively
    s_dtree *p = node;
    while (p != NULL) {
        //build the subtree of each partition
        dt_create(p, p->sample_x, p->sample_c, cfeature - 1);
        p = p->sibling;
    }
    return node;
}

//release the tree
int dt_free(s_dtree *node) {
    s_dtree *p = node;
    if (p == NULL) {
        return 0;
    }
    s_dtree *d = p;
    dt_free(p->sibling);
    dt_free(p->child);
    if (d->sample_x != NULL) {
        //release the sub-sample
        free(d->sample_x);
    }
    //release the tree node
    free(d);
    return 0;
}

//classify a sample with the decision tree
int dt_classify(s_dtree *node, int *sample_x) {
    if (node == NULL) {
        return -1;
    }
    //value of this node's splitting feature in the sample
    int v = sample_x[node->sample_fi];
    //bottom-level node: only the class column is left
    if (node->sample_f == 1) {
        //the sample value matches this node
        if (node->sample_v == v) {
            //return the class label
            return node->sample_y;
        }
        //otherwise try the siblings
        return dt_classify(node->sibling, sample_x);
    }
    //the sample value matches this node
    if (node->sample_v == v) {
        //new sample with the current feature removed
        int sample_target[node->sample_f];
        //copy the remaining values
        for (int i = 0, k = 0; i < node->sample_f; i++) {
            //drop the current feature
            if (i != node->sample_fi) {
                sample_target[k] = sample_x[i];
                k++;
            }
        }
        //descend into the children
        return dt_classify(node->child, sample_target);
    }
    //otherwise try the siblings
    return dt_classify(node->sibling, sample_x);
}
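To exercise the functions above, here is a minimal driver sketch. It assumes the train array defined after the table and the code above live in the same source file; the main function and the test sample are illustrative additions, not part of the original program.

#include <stdio.h>

int main(void) {
    //build the tree: 10 samples, 4 columns (3 features + the class label)
    dt_create(NULL, train, 10, 4);
    //classify a new sample: color=0, sound=0, texture=0
    int sample[3] = { 0, 0, 0 };
    printf("class = %d\n", dt_classify(root, sample));
    //release the tree
    dt_free(root);
    return 0;
}

With the encoded table above, this sample follows the branch color=0, sound=0, texture=0 and should print class = 1, i.e. a good melon.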