2. Data processing
In [ ]:
import os
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import dtreeviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
In [2]:
# load the guest-user retention sample (tab-separated)
dataset = pd.read_csv("../uncommitted/yandi_bigdata_vf_guest_user_sample_tree6", sep='\t')
dataset.head()
Out[2]:
 | uid | source3 | city | expo_time | device | dt | pt |
---|---|---|---|---|---|---|---|
0 | 1008792189321 | 自然点击 | 北京 | 11 | apple | working6 | 0 |
1 | 1008901220 | 厂商_内容合作 | 十堰 | 9 | apple | working6 | 0 |
2 | 1009047004691 | 自然点击 | 温州 | 0 | huawei | working6 | 0 |
3 | 1009146331245 | 自然点击 | 宝鸡 | 14 | huawei | working6 | 0 |
4 | 1009433523553 | 自然点击 | 郑州 | 13 | huawei | working6 | 0 |
Column meanings:

- `pt`: whether the user was retained
- `expo_time`: hour of the exposure, an integer from 0 to 23
- `source3`: acquisition channel
- `device`: the user's phone brand
- `city`: the user's city

Next, the string columns are encoded as integers so they can be used for modeling. This is analogous to `sklearn.preprocessing.LabelEncoder`; in pandas the same thing is done with the `factorize` method.
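A quick sketch of what `factorize` returns, on made-up values (not from the real dataset):

```python
import pandas as pd

# toy example: codes follow order of first appearance, uniques is the lookup table
s = pd.Series(["huawei", "apple", "huawei", "oppo"])
codes, uniques = pd.factorize(s)
print(codes)    # [0 1 0 2]
print(uniques)  # Index(['huawei', 'apple', 'oppo'], dtype='object')
```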
In [3]:
feature_names = {}
category_columns = ["source3", "city", "device"]
for column in category_columns:
    dataset[column], feature_name = dataset[column].factorize()
    feature_names[column] = feature_name
int_columns = []
for column in int_columns:
    dataset[column].fillna(-1, inplace=True)
print(dataset[category_columns].head(10))
print(feature_names)
   source3  city  device
0        0     0       0
1        1     1       0
2        0     2       1
3        0     3       1
4        0     4       1
5        0     5       2
6        0     6       0
7        0     7       2
8        0     8       1
9        0     9       2
{'source3': Index(['自然点击', '厂商_内容合作', '合作PUSH', '社交', '兴趣', '开放平台', '厂商', '游戏', '运营商', '大型APP_合作', '程序化_人工', '线上第三方', '运营热点', '消息互动', '游戏SDK', '程序化', '未报备', '预装PUSH'], dtype='object'),
 'city': Index(['北京', '十堰', '温州', '宝鸡', '郑州', '揭阳', '济南', '汕头', '海口', '福州', ... '甘南', '阿坝', '阿拉善盟', '阿里', '济源', '黄南', '海南', '潜江', '阿拉尔', '五家渠'], dtype='object', length=352),
 'device': Index(['apple', 'huawei', 'oppo', 'xiaomi', 'vivo', 'blackshark', 'doov', 'samsung', 'realme', 'honor', ... 'ibasso', 'caltta technologies', 'ysd', 'zhishifangzhou', 'acer', 'saifl', 'hytera', 'intel', 'z7s', 'wiz'], dtype='object', length=744)}
The category dictionaries are long. Since this is exploratory analysis, it is enough to keep only the most frequent values and fold the long tail into a single bucket.
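The same truncation can also be done without the per-row lambda used in the cell below; a sketch with vectorized pandas operations (equivalent in effect, just a different style):

```python
# keep the 20 most frequent device codes, map the long tail to -1
top_devices = dataset["device"].value_counts().index[:20]
dataset["device"] = dataset["device"].where(dataset["device"].isin(top_devices), -1)
```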
In [4]:
# keep topk indices of devices
index = dataset.device.value_counts().iloc[:20].index
dataset["device"] = dataset.device.apply(lambda x: x if x in index else -1)
In [5]:
# keep topk indices of cities
index = dataset.city.value_counts().iloc[:20].index
dataset["city"] = dataset.city.apply(lambda x: x if x in index else -1)
In [6]:
category_columns = ["source3", "device", "city"]
int_columns = ["expo_time",]
target_column = "pt"
feature_columns = category_columns + int_columns
dataset = dataset[feature_columns + [target_column]]
In [7]:
dataset.pt.value_counts()
Out[7]:
0    7138497
1    5509182
Name: pt, dtype: int64
In [8]:
dataset = dataset.dropna()
In [9]:
# dataset = dataset.astype({target_column: "int"})
# for column in category_columns:
#     dataset = dataset.astype({column: "category"})
In [10]:
f_city = pd.DataFrame(dataset.city.value_counts())
f_city["city_name"] = f_city.index.map(lambda x: feature_names["city"][x] if x!=-1 else "others")
f_city
Out[10]:
 | city | city_name |
---|---|---|
-1 | 7691491 | others |
0 | 1312280 | 北京 |
40 | 341369 | 上海 |
25 | 311106 | 重庆 |
65 | 305374 | 广州 |
41 | 284482 | 成都 |
77 | 229453 | 武汉 |
4 | 202350 | 郑州 |
48 | 196921 | 深圳 |
62 | 190219 | 西安 |
33 | 185553 | 南京 |
43 | 170546 | 石家庄 |
17 | 166554 | 杭州 |
19 | 163369 | 天津 |
6 | 159692 | 济南 |
55 | 134339 | 长沙 |
84 | 127452 | 青岛 |
15 | 125977 | 济宁 |
38 | 120150 | 临沂 |
101 | 118926 | 苏州 |
99 | 110076 | 合肥 |
In [11]:
f_device = pd.DataFrame(dataset.device.value_counts())
f_device["device_name"] = f_device.index.map(lambda x: feature_names["device"][x] if x!=-1 else "others")
f_device
Out[11]:
 | device | device_name |
---|---|---|
0 | 3862458 | apple |
1 | 2593778 | huawei |
4 | 2004773 | vivo |
3 | 1532903 | xiaomi |
2 | 1285823 | oppo |
9 | 656446 | honor |
8 | 181392 | realme |
7 | 134811 | samsung |
10 | 130585 | oneplus |
-1 | 63268 | others |
12 | 44254 | ptac |
21 | 39576 | lenovo |
20 | 25974 | meizu |
16 | 21770 | cmdc |
22 | 15819 | tianyi |
14 | 11047 | motorola |
13 | 10204 | liantong |
5 | 9562 | blackshark |
27 | 7794 | tdtech |
15 | 7735 | zte |
18 | 7707 | nubia |
In [12]:
f_source3 = pd.DataFrame(dataset.source3.value_counts())
f_source3["source3_name"] = f_source3.index.map(lambda x: feature_names["source3"][x] if x!=-1 else "others")
f_source3
Out[12]:
 | source3 | source3_name |
---|---|---|
0 | 11370017 | 自然点击 |
1 | 406083 | 厂商_内容合作 |
9 | 199252 | 大型APP_合作 |
2 | 158372 | 合作PUSH |
7 | 141909 | 游戏 |
5 | 112316 | 开放平台 |
3 | 56852 | 社交 |
8 | 49020 | 运营商 |
11 | 48090 | 线上第三方 |
4 | 41754 | 兴趣 |
12 | 26107 | 运营热点 |
13 | 10335 | 消息互动 |
10 | 9575 | 程序化_人工 |
14 | 7301 | 游戏SDK |
15 | 4420 | 程序化 |
6 | 3576 | 厂商 |
-1 | 2233 | others |
16 | 464 | 未报备 |
17 | 3 | 预装PUSH |
3. Building the decision trees
In [13]:
data = lgb.Dataset(dataset[feature_columns],
                   label=dataset[target_column],
                   feature_name=feature_columns,
                   categorical_feature=category_columns,
                   free_raw_data=False
                   )
In [14]:
param = {
    'boosting_type': 'dart',
    'objective': 'binary',
    'metric': ['binary_logloss', "auc"],
    'num_leaves': 10,
    'learning_rate': 0.5,
    'feature_fraction': 0.8,
    'max_depth': 4,
    'min_data_in_leaf': 3000,
    "min_data_per_group": 1000,
    'verbose': 0,
}
In [15]:
num_round = 10
Cross-validation is built into LightGBM via `cv`, so there is no need to split the data by hand, and it can be used for parameter tuning. Since the dataset here is modest, we simply build the trees with the parameters above, without further tuning.
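A sketch of how the `cv` result could be used to choose the number of boosting rounds (assuming the `auc-mean` key returned by this LightGBM version, as seen in the output below):

```python
# 5-fold CV, then pick the round with the best mean validation AUC
cv_results = lgb.cv(param, data, num_boost_round=num_round, nfold=5)
best_round = int(pd.Series(cv_results["auc-mean"]).idxmax()) + 1
print("best num_round by CV AUC:", best_round)
```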
In [16]:
lgb.cv(param, data, num_round, nfold=5)
/data0/users/yandi/project/user_dim_reduction/.venv/lib/python3.7/site-packages/lightgbm/basic.py:2065: UserWarning: Using categorical_feature in Dataset.
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039645 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039653 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039339 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044530 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040019 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`.
Out[16]:
{'binary_logloss-mean': [0.6611806137789256, 0.6483566573409874, 0.6404597771659992, 0.6385914070333497, 0.6367117987529956, 0.6359197156939506, 0.6356236439452655, 0.636605771025251, 0.6364385653294171, 0.6363157134427969],
 'binary_logloss-stdv': [5.229218689134398e-05, 5.5011081201943883e-05, 8.224705563347412e-05, 9.837604266465201e-05, 0.00010706421224167403, 0.00016609106057672847, 0.00015859042212779795, 0.00013722531248463153, 0.00012994185470165104, 0.0001221683993274359],
 'auc-mean': [0.640303475177552, 0.6608994915882134, 0.666050028475772, 0.6672431281779567, 0.6680580318063558, 0.6682714793676438, 0.6687496142214466, 0.668521727769987, 0.668657710774117, 0.6688400349973666],
 'auc-stdv': [0.00022117752433973458, 0.0001829753917785809, 0.00020077043000291448, 0.00025607071453552505, 0.00024811254813978555, 0.0002578193120040691, 0.00031748834012006644, 0.00027790596220326043, 0.00026362820616253444, 0.00019591119864766687]}
In [17]:
train_index, valid_index = train_test_split(dataset.index, test_size=0.2, random_state=42)
In [18]:
train_data = lgb.Dataset(dataset.loc[train_index][feature_columns],
                         label=dataset.loc[train_index][target_column],
                         feature_name=feature_columns,
                         categorical_feature=category_columns,
                         free_raw_data=False
                         )
valid_data = lgb.Dataset(dataset.loc[valid_index][feature_columns],
                         label=dataset.loc[valid_index][target_column],
                         feature_name=feature_columns,
                         categorical_feature=category_columns,
                         free_raw_data=False
                         ).construct()
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
In [19]:
gbm = lgb.train(param, train_data, num_round)
/data0/users/yandi/project/user_dim_reduction/.venv/lib/python3.7/site-packages/lightgbm/basic.py:2065: UserWarning: Using categorical_feature in Dataset.
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040152 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`.
In [21]:
# visualize the first three trees
for i in range(3):
    fig, ax = plt.subplots(figsize=(30, 30))
    lgb.plot_tree(gbm, tree_index=i, ax=ax, orientation="horizontal", show_info=["split_gain", "internal_count", "data_percentage", "leaf_count"])
    plt.show()
In [22]:
sub_dataset = dataset.sample(10000)
viz_model = dtreeviz.model(
    gbm,
    sub_dataset[feature_columns],
    sub_dataset[target_column],
    tree_index=0,
    target_name=target_column,
    feature_names=feature_columns,
    class_names=["not_retained", "retained"],  # pt=0 / pt=1
)
In [23]:
viz_model.view(orientation="LR")
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
In [27]:
y_pred = gbm.predict(dataset.loc[valid_index][feature_columns])
# note: sklearn's signature is confusion_matrix(y_true, y_pred); passing the thresholded
# predictions first transposes rows/columns relative to that convention
confusion_matrix(y_pred>0.5, dataset.loc[valid_index][target_column])
Out[27]:
array([[1178483,  688089],
       [ 248999,  413965]])
In [28]:
print(classification_report(dataset.loc[valid_index][target_column], y_pred>0.5))
              precision    recall  f1-score   support

           0       0.63      0.83      0.72   1427482
           1       0.62      0.38      0.47   1102054

    accuracy                           0.63   2529536
   macro avg       0.63      0.60      0.59   2529536
weighted avg       0.63      0.63      0.61   2529536
4. Model interpretation

This section interprets the model. We can see that expo_time and source3 are the most important features, while device and city matter much less. That alone is not enough, though: we also want to know how combinations of features affect retention.

The idea is that each leaf of the GBDT carries a value. If a leaf's value is positive, the samples that fall into it are pushed toward retention; if it is negative, they are pushed away from it.

Reading the raw decision paths, however, is confusing: they are just combinations of features and thresholds and hard to interpret. The approach here is to look at how feature values are distributed among the samples that land in each leaf (the share of each feature value), which reveals how feature combinations influence retention.
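The importance ranking mentioned above can be read straight off the booster; a minimal sketch using LightGBM's built-in split- and gain-based importances:

```python
# feature importance by number of splits and by total split gain
importance = pd.DataFrame({
    "feature": gbm.feature_name(),
    "splits": gbm.feature_importance(importance_type="split"),
    "gain": gbm.feature_importance(importance_type="gain"),
}).sort_values("gain", ascending=False)
print(importance)
```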
In [29]:
def describe_cat(df, target_column, feature_names=None, topK=5, min_ratio=0.01):
    """ Profile a cohort on a categorical column.
    For example, given a df with users' age, gender, income, spending power, etc.,
    this summarizes how one categorical column is distributed within that cohort.
    Args:
        df: dataset
        target_column: column to describe
        feature_names: mapping from integer codes to readable names
        topK: keep the topK most frequent categories
        min_ratio: keep only categories whose share exceeds min_ratio
    Returns:
        feat: cohort profile string
    """
    # share of each category
    f = pd.DataFrame({"cnt": df[target_column].value_counts()})
    if feature_names is not None:
        f["name"] = f.index.map(lambda x: feature_names[x] if x!=-1 else "others")
    f["ratio"] = f["cnt"] / f["cnt"].sum()
    f = f.sort_values("ratio", ascending=False)
    f = f[f.ratio > min_ratio].iloc[:topK]
    # concat index with name, ratio
    if feature_names is not None:
        f = f.index.astype(str) + "@" + f["name"] + "@" + f["ratio"].round(2).astype(str)
    else:
        f = f.index.astype(str) + "@" + f["ratio"].round(2).astype(str)
    f = f.str.cat(sep="|")
    return f

def describe_int(df, target_column):
    """ Profile a cohort on a numeric column.
    For example, given a df with users' age, gender, income, spending power, etc.,
    this summarizes the distribution of one numeric column within that cohort.
    Args:
        df: dataset
        target_column: column to describe
    Returns:
        feat: cohort profile string
    """
    # distribution of the numeric feature
    f = df[target_column].describe()
    f = f.index.astype(str) + "@" + f.round(2).astype(str)
    f = f.str.cat(sep="|")
    return f
In [30]:
# construct path to leaf from tree
def get_path_to_leaf(tree, tree_index, leaf_index):
    """ get path to leaf from tree
    Args:
        tree: tree dataframe
        tree_index: tree index
        leaf_index: leaf index
    Returns:
        path: path to leaf
    """
    path = []
    node_index = leaf_index
    while True:
        row = tree[(tree.tree_index==tree_index) & (tree.node_index==node_index)]
        if len(row) == 0:
            break
        path.append(node_index)
        node_index = row.parent_index.values[0]
    path = path[::-1]
    return path

# construct decision path from tree and path
def get_decision_path(tree, path):
    """ get decision path from tree and path
    Args:
        tree: tree dataframe
        path: path to leaf
    Returns:
        path: decision path
    """
    decisions = []
    for i in range(0, len(path) - 1):
        parent, child = path[i: i+2]
        row = tree[tree.node_index==parent]
        if row.left_child.values[0] == child:
            direction = 1
        else:
            direction = 0
        split_reason = row.split_feature.values[0] \
            + row.decision_type.values[0] \
            + str(row.threshold.values[0]) \
            + ":" + str(direction)
        decisions.append(split_reason)
    return decisions

# get tree stat, order by value_norm
def get_tree_stat(gbm, tree_index):
    tree = gbm.trees_to_dataframe()
    tree = tree[tree.tree_index == tree_index]
    tree["path"] = tree["node_index"].apply(lambda x: ",".join(get_path_to_leaf(tree, tree_index, x)))
    tree["decision"] = tree["path"].apply(lambda x: ",".join(get_decision_path(tree, x.split(","))))
    tree["node"] = tree["node_index"].apply(lambda x: int(x.split("-L")[1]) if "-L" in x else -1)
    tree = tree[tree["node"] > -1]
    tree["value_norm"] = tree.value - tree.value.mean()
    tree = tree.sort_values("value_norm", ascending=False)
    tree.set_index("node", inplace=True)
    keep_cols = ["tree_index", "node_index", "value", "path", "decision", "value_norm"]
    tree = tree[keep_cols]
    return tree
In [32]:
sample = valid_data.data.copy()
sample["pt"] = valid_data.get_label()
In [33]:
# get the leaf index each row falls into, one per tree; e.g. with five trees: [29, 2, 8, 26, 2]
embeddings = gbm.predict(valid_data.get_data(), pred_leaf=True)
In [34]:
tree_index = [0, 1]
In [35]:
leaf_cnt = pd.DataFrame({"cnt": pd.DataFrame(embeddings[:,tree_index]).value_counts()})
In [36]:
tree_stat = {}
for i in tree_index:
    tree_stat[i] = get_tree_stat(gbm, i)
tree_stat[0]
Out[36]:
node | tree_index | node_index | value | path | decision | value_norm |
---|---|---|---|---|---|---|
0 | 0 | 0-L0 | 0.278742 | 0-S0,0-S2,0-S3,0-L0 | expo_time<=8.500000000000002:1,device==0||1||3... | 0.455888 |
5 | 0 | 0-L5 | 0.151277 | 0-S0,0-S2,0-S3,0-S4,0-L5 | expo_time<=8.500000000000002:1,device==0||1||3... | 0.328423 |
3 | 0 | 0-L3 | 0.003205 | 0-S0,0-S2,0-S6,0-L3 | expo_time<=8.500000000000002:1,device==0||1||3... | 0.180352 |
4 | 0 | 0-L4 | -0.094405 | 0-S0,0-S2,0-S3,0-S4,0-L4 | expo_time<=8.500000000000002:1,device==0||1||3... | 0.082742 |
1 | 0 | 0-L1 | -0.115240 | 0-S0,0-S1,0-S5,0-L1 | expo_time<=8.500000000000002:0,device==0||1||3... | 0.061906 |
6 | 0 | 0-L6 | -0.221240 | 0-S0,0-S1,0-S5,0-S7,0-L6 | expo_time<=8.500000000000002:0,device==0||1||3... | -0.044093 |
7 | 0 | 0-L7 | -0.318670 | 0-S0,0-S2,0-S6,0-L7 | expo_time<=8.500000000000002:1,device==0||1||3... | -0.141524 |
8 | 0 | 0-L8 | -0.337693 | 0-S0,0-S1,0-S5,0-S7,0-L8 | expo_time<=8.500000000000002:0,device==0||1||3... | -0.160546 |
9 | 0 | 0-L9 | -0.495727 | 0-S0,0-S1,0-S8,0-L9 | expo_time<=8.500000000000002:0,device==0||1||3... | -0.318580 |
2 | 0 | 0-L2 | -0.621714 | 0-S0,0-S1,0-S8,0-L2 | expo_time<=8.500000000000002:0,device==0||1||3... | -0.444568 |
This table lists every leaf of the first tree, its decision path, and whether its lift is positive or negative (the lift, value_norm, is the leaf value minus the mean leaf value of that tree). Next, for each leaf we count the share of each feature value among the samples that fall into it.
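Before the full two-tree version in the next cells, a minimal single-tree sketch of the same idea, grouping validation rows by the leaf of tree 0 they land in and profiling each group (`leaf0`, `valid_df`, and `per_leaf` are hypothetical names introduced here for illustration):

```python
# map each validation row to its leaf in tree 0, then profile each leaf
leaf0 = gbm.predict(dataset.loc[valid_index][feature_columns], pred_leaf=True)[:, 0]
valid_df = dataset.loc[valid_index]
per_leaf = valid_df.groupby(leaf0).agg(
    retention_rate=("pt", "mean"),
    top_device=("device", lambda s: s.value_counts(normalize=True).idxmax()),
    n=("pt", "size"),
)
print(per_leaf.sort_values("retention_rate", ascending=False))
```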
In [37]:
from collections import defaultdict

feature_stat = defaultdict(dict)
for i_node, cnt in leaf_cnt.iterrows():
    subsample = sample[(embeddings[:, tree_index] == i_node).all(axis=1)]
    feature_stat[i_node] = {
        "tree_index": tree_index,
        "node": i_node,
        "path": "@@@".join([tree_stat[i].loc[j]["path"] for i, j in enumerate(i_node)]),
        "decision": "@@@".join([tree_stat[i].loc[j]["decision"] for i, j in enumerate(i_node)]),
        "value_norm": round(sum(tree_stat[i].loc[j]["value_norm"] for i, j in enumerate(i_node)), 2),
        "value": round(sum(tree_stat[i].loc[j]["value"] for i, j in enumerate(i_node)), 2),
        "cnt": leaf_cnt["cnt"][i_node],
        "ratio": (leaf_cnt["cnt"][i_node] / leaf_cnt["cnt"].sum()).round(2),
        "device": describe_cat(subsample, "device", feature_names=feature_names["device"]),
        "city": describe_cat(subsample, "city", feature_names=feature_names["city"]),
        "expo_time": describe_cat(subsample, "expo_time"),
        "expo_time_sta": describe_int(subsample, "expo_time"),
        "pt_cat": describe_cat(subsample, "pt"),
        "source3": describe_cat(subsample, "source3", feature_names=feature_names["source3"]),
    }
feature_stat
Out[37]:
defaultdict(dict, {(8, 3): {'tree_index': [0, 1], 'node': (8, 3), 'path': '0-S0,0-S1,0-S5,0-S7,0-L8@@@1-S0,1-S1,1-S2,1-S4,1-L3', 'decision': 'expo_time<=8.500000000000002:0,device==0||1||3||7||20:1,expo_time<=10.500000000000002:0,device==0:0@@@source3==1||2||5||6||7||9||11||14||16:0,device==4||5||14||18||21:0,device==2||7||8||10||15||16||22||27:0,source3==0||10:1', 'value_norm': 0.03, 'value': -0.28, 'cnt': 449657, 'ratio': 0.18, 'device': '1@huawei@0.64|3@xiaomi@0.35', 'city': '-1@others@0.61|0@北京@0.09|25@重庆@0.03|40@上海@0.03|65@广州@0.02', 'expo_time': '22@0.09|21@0.08|12@0.08|11@0.08|23@0.08', 'expo_time_sta': 'count@449657.0|mean@17.08|std@3.81|min@11.0|25%@14.0|50%@17.0|75%@20.0|max@23.0', 'pt_cat': '0.0@0.58|1.0@0.42', 'source3': '0@自然点击@1.0'}, (6, 3): {'tree_index': [0, 1], 'node': (6, 3), 'path': '0-S0,0-S1,0-S5,0-S7,0-L6@@@1-S0,1-S1,1-S2,1-S4,1-L3', 'decision': 'expo_time<=8.500000000000002:0,device==0||1||3||7||20:1,expo_time<=10.500000000000002:0,device==0:1@@@source3==1||2||5||6||7||9||11||14||16:0,device==4||5||14||18||21:0,device==2||7||8||10||15||16||22||27:0,source3==0||10:1', 'value_norm': 0.14, 'value': -0.16, 'cnt': 446155, 'ratio': 0.18, 'device': '0@apple@1.0', 'city': '-1@others@0.58|0@北京@0.1|40@上海@0.04|41@成都@0.03|65@广州@0.03', 'expo_time': '22@0.09|23@0.09|21@0.08|12@0.08|11@0.08', 'expo_time_sta': 'count@446155.0|mean@17.13|std@3.84|min@11.0|25%@14.0|50%@17.0|75%@21.0|max@23.0', 'pt_cat': '0.0@0.55|1.0@0.45', 'source3': '0@自然点击@1.0'}, (2, 5): {'tree_index': [0, 1], 'node': (2, 5), 'path': '0-S0,0-S1,0-S8,0-L2@@@1-S0,1-S1,1-S2,1-S4,1-L5', 'decision': 'expo_time<=8.500000000000002:0,device==0||1||3||7||20:0,device==4||5||13||15||16||18||22||27:1@@@source3==1||2||5||6||7||9||11||14||16:0,device==4||5||14||18||21:0,device==2||7||8||10||15||16||22||27:0,source3==0||10:0', 'value_norm': -0.07, 'value': -0.37, 'cnt': 17, 'ratio': 0.0, 'device': '13@liantong@1.0', 'city': '-1@others@0.76|0@北京@0.12|4@郑州@0.06|65@广州@0.06', 'expo_time': '19@0.35|18@0.18|14@0.12|13@0.12|10@0.06', 'expo_time_sta': 'count@17.0|mean@16.12|std@3.69|min@9.0|25%@13.0|50%@18.0|75%@19.0|max@21.0', 'pt_cat': '0.0@0.65|1.0@0.35', 'source3': '8@运营商@0.41|12@运营热点@0.29|4@兴趣@0.24|3@社交@0.06'}})
In [38]:
df = pd.DataFrame(feature_stat).T
df = df.sort_values("value", ascending=False)
In [39]:
df.to_csv("../uncommitted/feature_stat_yandi.tsv", index=False, sep="\t")