#include "gradient_boosting.h"
#include <iostream>
#include <vector>
#include <fstream>
using namespace std;
/*
 * Scan one libsvm-style line ("<label> <fid>:<val> <fid>:<val> ...") and
 * return the largest feature id that appears before a ':'.
 *
 * line - one whitespace-separated input record
 *
 * Returns the maximum parsed feature id, or 0 when the line contains no
 * parsable "<int>:" token.
 */
int max_feature_label(string line)
{
int max_fid = 0;
int token_begin = 0;
int n = line.length();
for (int pos = 0; pos < n; ++pos) {
char c = line[pos];
if (c == ' ') {
// a space starts a new "<fid>:<value>" token
token_begin = pos + 1;
continue;
}
if (c != ':') {
continue;
}
// the text between the last space and this ':' should be a feature id
int fid = 0;
bool parsed = (sscanf(line.substr(token_begin, pos - token_begin).c_str(), "%d", &fid) == 1);
if (parsed && fid > max_fid) {
max_fid = fid;
}
}
return max_fid;
}
/*
 * Split `line` on `separator`, writing at most `items_num` pieces into
 * `items`.
 *
 * line      - input string
 * items     - output array, must hold at least items_num strings
 * items_num - capacity of `items`
 * separator - single delimiter character
 *
 * Returns the number of pieces written, or -1 on bad parameters.
 *
 * Consecutive separators yield an empty piece; a trailing separator does
 * not produce a trailing empty piece (the `start < n` guard).
 */
int splitline(string line, string items[], int items_num, const char separator)
{
if(items == NULL || items_num <= 0) {
return -1;
}
int n = line.length();
int j = 0;
int start = 0;
for(int i = 0; i < n; i++) {
if(line[i] == separator) {
if(j < items_num && start < n) {
items[j++] = line.substr(start, i-start);
// BUGFIX: was `start = ++i;` — combined with the loop's own i++
// that skipped the character right after every separator, so a
// second consecutive separator leaked into the next token
// ("a,,b" -> "a", ",b"). Advance start only; let the loop move i.
start = i + 1;
}
}
}
if(j < items_num && start < n) {
items[j++] = line.substr(start, n-start);
}
return j;
}
/*
 * Choose the best (feature, threshold) split for the tree node covering
 * samples index[ninf.index_b .. ninf.index_e].
 *
 * gbdt_inf    - training configuration (fea_num, rand_fea_num, sample_num, ...)
 * data_set    - scratch buffers: fea_pool, fvalue_list, fv, y_list, order_i
 * x_fea_value - row-major sample-by-feature matrix; sample i, feature f is
 *               x_fea_value[i * gbdt_inf.fea_num + f]
 * y_score     - per-sample target (gradient) values
 * ninf        - node range [index_b, index_e], count, sum and parent criterion
 * index       - sample-id permutation; on success the node's range is
 *               reordered so left-child samples precede right-child samples
 * spinf       - out: bestid (feature), bestsplit (threshold), pivot (first
 *               position of the right child inside the node range)
 *
 * Returns 0 when a split was found, 1 when no feature yields gain (caller
 * should mark the node terminal), -1 on parameter error.
 */
int gbdt_tree_node_split(
gbdt_info_t gbdt_inf,
bufset* data_set,
double *x_fea_value,
double *y_score,
nodeinfo ninf,
int* index,
splitinfo* spinf)
{
if(data_set == NULL || x_fea_value == NULL || y_score == NULL || index == NULL || spinf == NULL)
{
LOG_ERROR_("Parameter error.");
return -1;
}
spinf->bestsplit = -1;
spinf->bestid = -1;
/* reset the feature pool to the identity permutation 0..fea_num-1 */
for (int i=0; i < gbdt_inf.fea_num; ++i)
{
data_set->fea_pool[i] = i;
}
int last = gbdt_inf.fea_num - 1;
double critmax = 0.0;
/* draw rand_fea_num distinct features by partial Fisher-Yates shuffling of
 * fea_pool: the chosen id is swapped to position `last`, which then shrinks */
for (int i = 0; i < gbdt_inf.rand_fea_num; ++i)
{
//debug
//int select = last;
int select = rand() % (last+1);
int fid = data_set->fea_pool[select]; // candidate feature id in [0, fea_num-1]
data_set->fea_pool[select] = data_set->fea_pool[last];
data_set->fea_pool[last] = fid;
last--;
/* gather this feature's values and the targets for the node's samples */
for (int j = ninf.index_b; j <= ninf.index_e; j++){
data_set->fvalue_list[j] = x_fea_value[index[j]* gbdt_inf.fea_num + fid];
data_set->fv[j] = data_set->fvalue_list[j];
data_set->y_list[j] = y_score[index[j]];
}
/* reset order_i over the whole sample buffer (only the node's slice is
 * sorted below, but the full reset keeps stale entries out) */
for (int j = 0; j < gbdt_inf.sample_num; ++j) {
data_set->order_i[j] = j;
}
/* NOTE(review): the +1 bounds suggest R_qsort_I takes 1-based inclusive
 * indices (Fortran-style) — confirm; it sorts fv ascending over the node
 * range and applies the same permutation to order_i. */
R_qsort_I(data_set->fv, data_set->order_i, ninf.index_b+1 , ninf.index_e+1);
// debug
//for (int s = 0; s<300; s++)
//{
// printf("%d==%lf_%d\n", s, data_set->fv[s],data_set->order_i[s]);
//}
if (data_set->fv[ninf.index_b] >= data_set->fv[ninf.index_e]) {
continue; // feature is constant on this node: no split possible
}
/* sweep candidate thresholds left-to-right, maintaining running sums */
double left_sum = 0.0;
int left_num = 0;
double right_sum = ninf.nodesum;
int right_num = ninf.nodenum;
double d = 0.0;
double crit = 0.0;
double tmpsplit = 0;
double critvar = 0;
for (int j=ninf.index_b; j< ninf.index_e; j++)
{
// d = y_result_score[data_set->order_i[j]];
/* NOTE(review): y_list is indexed with the post-sort order_i[j]; this
 * assumes the sort keeps order_i values inside [index_b, index_e] —
 * verify against R_qsort_I's convention. */
d = data_set->y_list[data_set->order_i[j]];
left_sum += d;
right_sum -= d;
left_num++;
right_num--;
/* only positions between two distinct feature values are valid cuts */
if (data_set->fv[j] < data_set->fv[j+1]) {
/* regression gain: left_sum^2/n_l + right_sum^2/n_r minus the
 * parent's term (ninf.critparent); larger is better */
crit = (left_sum * left_sum / left_num) + (right_sum * right_sum / right_num) - ninf.critparent;
if (crit > critvar) {
/* threshold is the midpoint between the neighboring values */
tmpsplit = (data_set->fv[j] + data_set->fv[j+1]) / 2.0;
critvar = crit;
}
}
}
/* keep this feature if it beats the best criterion seen so far */
if (critvar > critmax) {
spinf->bestsplit = tmpsplit; // split feature value
spinf->bestid = fid; // split feature id
critmax = critvar; // split criterion value
}
}
/* a best feature was found: partition the node's slice of `index` so the
 * left child (values <= bestsplit) precedes the right child */
if( spinf->bestid >= 0)
{
int nl = ninf.index_b;
/* first pass: collect left-child sample ids into order_i */
for (int j= ninf.index_b; j<= ninf.index_e; j++)
{
if (x_fea_value[index[j]* gbdt_inf.fea_num + spinf->bestid] <= spinf->bestsplit)
{
data_set->order_i[nl] = index[j]; //update data->set
nl++;
}
}
int nr = nl;
/* second pass: append right-child sample ids after the left child */
for (int j= ninf.index_b; j<= ninf.index_e; j++)
{
if (x_fea_value[index[j]* gbdt_inf.fea_num + spinf->bestid] > spinf->bestsplit)
{
data_set->order_i[nr] = index[j];
nr++;
}
}
/* copy the partitioned order back into index */
for (int j= ninf.index_b; j<= ninf.index_e; j++)
{
index[j] = data_set->order_i[j];
}
spinf->pivot = nl; /* first position of the right child */
return 0;
}
else
{
return 1;
}
}
int gbdt_single_tree_estimation(
double *x_fea_value,
double *y_gradient,
gbdt_info_t gbdt_inf,
bufset* data_set,
int* index,
gbdt_tree_t* gbdt_single_tree,
int nrnodes )
{
if(x_fea_value == NULL || y_gradient == NULL || data_set == NULL || index == NULL || gbdt_single_tree == NULL)
{
LOG_ERROR_("Parameter error.");
return -1;
}
splitinfo* spinf = (splitinfo*) malloc( sizeof(splitinfo));
if(spinf == NULL) {
LOG_ERROR_("Failed to allocate memory.");
return -1;
}
spinf->bestid = -1;
for (int i = 0; i < gbdt_inf.sample_num; ++i) index[i] = i;
int ncur = 0;
gbdt_single_tree->nodestatus[0] = GBDT_TOSPLIT;
gbdt_single_tree->ndstart[0] = 0;
gbdt_single_tree->ndcount[0] = gbdt_inf.sample_num;
gbdt_single_tree->depth[0] = 0;
/* compute mean and sum of squares for Y */
double avg = 0.0;
for (int i = 0; i < gbdt_inf.sample_num; ++i)
avg = (i * avg + y_gradient[index[i]]) / (i + 1);
gbdt_single_tree->ndavg[0] = avg;
if (gbdt_single_tree->ndcount[0] <= gbdt_inf.gbdt_min_node_size)
{
gbdt_single_tree->nodestatus[0] = GBDT_TERMINAL;
gbdt_single_tree->lson[0] = 0; // debug temp
gbdt_single_tree->rson[0] = 0;
gbdt_single_tree->splitid[0] = 0;
gbdt_single_tree->splitvalue[0] = 0.0;
gbdt_single_tree->nodesize = 1;
free(spinf);
return 0;
}
/* start main loop */
for (int k = 0; k < nrnodes - 2; ++k) {
if (k > ncur || ncur >= nrnodes - 2) {
break;
}
/* skip if the node is not to be split */
if (gbdt_single_tree->nodestatus[k] != GBDT_TOSPLIT) {
continue;
}
/* initialize for next call to findbestsplit */
nodeinfo ninf;
ninf.index_b = gbdt_single_tree->ndstart[k];
ninf.index_e = gbdt_single_tree->ndstart[k] + gbdt_single_tree->ndcount[k] - 1;
ninf.nodenum = gbdt_single_tree->ndcount[k];
ninf.nodesum = gbdt_single_tree->ndcount[k] * gbdt_single_tree->ndavg[k];
ninf.critparent = (ninf.nodesum * ninf.nodesum) / ninf.nodenum;
int jstat;
jstat = gbdt_tree_node_split(gbdt_inf, data_set, x_fea_value, y_gradient, ninf, index, spinf);
if (jstat == 1) //
{
/* Node is terminal: Mark it as such and move on to the next. */
gbdt_single_tree->nodestatus[k] = GBDT_TERMINAL;
continue;
}
if(jstat == -1)
{
free(spinf);
return -1;
}
gbdt_single_tree->splitid[k] = spinf->bestid;
gbdt_single_tree->splitvalue[k] = spinf->bestsplit;
gbdt_single_tree->nodestatus[k] = GBDT_INTERIOR;
/* leftnode no.= ncur+1, rightnode no. = ncur+2. */
gbdt_single_tree->ndstart[ncur + 1] = ninf.index_b;
gbdt_single_tree->ndstart[ncur + 2] = spinf->pivot;
gbdt_single_tree->ndcount[ncur + 1] = spinf->pivot - ninf.index_b; //
gbdt_single_tree->ndcount[ncur + 2] = ninf.index_e - spinf->pivot + 1;
gbdt_single_tree->depth[ncur + 1] = gbdt_single_tree->depth[k] + 1;
gbdt_single_tree->depth[ncur + 2] = gbdt_single_tree->depth[k] + 1;
/* compute mean and sum of squares for the left son node */
double avg = 0.0;
double d = 0.0;
int m = 0;
for (int j = ninf.index_b; j < spinf->pivot; ++j) // mean
{
d = y_gradient[index[j]];
m = j - ninf.index_b;
avg = (m * avg + d) / (m+1);
}
double var = 0.0;
for (int j = ninf.index_b; j < spinf->pivot; ++j)
{
var += (y_gradient[index[j]] - avg) * (y_gradient[index[j]] - avg);
}
var /= (spinf->pivot - ninf.index_b);
gbdt_single_tree->ndavg[ncur+1] = avg;
gbdt_single_tree->nodestatus[ncur+1] = GBDT_TOSPLIT;
if (gbdt_single_tree->ndcount[ncur + 1] <= gbdt_inf.gbdt_min_node_size)
{
gbdt_single_tree->no
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
gbdt.tar.gz (48个子文件)
gbdt
do_predict 52KB
gbdt_test.cpp 3KB
lib_gbdt
do_predict 55KB
gbdt_rabbit.cpp 5KB
gbdt.so 240KB
ori_input 104KB
gbdt_test.cpp 3KB
gradient_boosting.o 185KB
output
include
gradient_boosting.h 6KB
lib
libgbdt.a 189KB
test
gbdt.so 240KB
gbdt-test 156KB
gbdt-train 158KB
tags 8KB
gradient_boosting.cpp 37KB
test.result 5KB
gbdt_train.cpp 3KB
gbdt-test 156KB
gradient_boosting.h 6KB
gbdt_rabbit.o 154KB
run.sh 69B
gbdt-train 158KB
gbdt_test.o 51KB
test.model 108KB
libgbdt.a 189KB
train 78KB
makefile 2KB
input 246KB
do_gbdt 53KB
evaluate.py 795B
gbdt_train.o 53KB
test 26KB
tags 7KB
gradient_boosting.cpp 29KB
test.100 47KB
lib_gbdt.tar.gz 232KB
gbdt_train.cpp 2KB
transform.pl 282B
gradient_boosting.h 4KB
run.sh 73B
test.model 322KB
gdbt_train 53KB
auc 85KB
makefile 320B
gbdt_train 45KB
do_gbdt 53KB
gbdt_train.o 11KB
aaa 84.71MB
共 48 条
- 1
帛逸TB
- 粉丝: 194
- 资源: 3
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功
- 1
- 2
- 3
- 4
- 5
- 6
前往页