kaggle竞赛实战5

接下来将两套方案的结果做数据拼接

读数据

# Load the train/test feature tables produced by the two feature-engineering
# schemes (named "dict" and "groupby" after their CSV files).
train_dict = pd.read_csv("preprocess/train_dict.csv")
test_dict = pd.read_csv("preprocess/test_dict.csv")
train_groupby = pd.read_csv("preprocess/train_groupby.csv")
test_groupby = pd.read_csv("preprocess/test_groupby.csv")

删除重复项

# Remove from the groupby tables every column that already exists in the
# dict tables, keeping only the join key 'card_id', so that the merge below
# does not produce duplicated (suffixed) columns.
for co in train_dict.columns:
    if co in train_groupby.columns and co != 'card_id':
        del train_groupby[co]
for co in test_dict.columns:
    if co in test_groupby.columns and co != 'card_id':
        del test_groupby[co]

拼接特征

# Left-join the groupby features onto the dict features by card_id; cards
# without a match in the groupby table get 0 for the missing features.
train = train_dict.merge(train_groupby, on='card_id', how='left').fillna(0)
test = test_dict.merge(test_groupby, on='card_id', how='left').fillna(0)

数据保存与内存管理

# Persist the merged feature tables (without the DataFrame index column).
train.to_csv("preprocess/train.csv", index=False)
test.to_csv("preprocess/test.csv", index=False)

# The intermediate tables are no longer needed: drop the references and
# trigger a garbage-collection pass to release their memory.
del train_dict, test_dict, train_groupby, test_groupby
gc.collect()

开始建模部分！！！！！！！！！！！

先用随机森林

# Reload the merged feature tables saved by the preprocessing step.
train, test = (
    pd.read_csv("preprocess/train.csv"),
    pd.read_csv("preprocess/test.csv"),
)

# Extract the feature names: every column except the id and the label.

features = train.columns.tolist()
features.remove("card_id")
features.remove("target")
# Work on a copy so that `features` itself stays untouched.
featureSelect = features[:]

# Screen features with the Pearson correlation coefficient. Every feature is
# treated as a continuous variable here (acceptable as long as no one-hot
# encoding has been applied).

# For each feature: take the [feature, target] pair, fill missing values with
# 0 and compute the 2x2 correlation matrix; values[0][1] is the off-diagonal
# entry, i.e. corr(feature, target). Only its absolute value matters.
corr = [
    abs(train[[fea, 'target']].fillna(0).corr().values[0][1])
    for fea in featureSelect
]

# Keep the 300 features most correlated with the target (the count is a
# free choice). The scores are indexed by feature name and ranked from the
# highest absolute correlation downwards.
se = pd.Series(data=corr, index=featureSelect)
se = se.sort_values(ascending=False)
feature_select = ['card_id', *se.iloc[:300].index.tolist()]

# Restrict both tables to the selected columns; only train has the label.
train = train[[*feature_select, 'target']]
test = test[list(feature_select)]

#用网格搜索进行参数调优

#导入包

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor #随机森林分类器
from sklearn.model_selection import GridSearchCV #网格搜索评估器

随机森林的参数有

我们挑选 "n_estimators"（几棵树）、"min_samples_leaf"、"min_samples_split"、"max_depth" 和 "max_features"（每次进行特征选择时所纳入考虑的最大特征数）这几个参数进行搜索

调参思路：用 halving 方法，其思路为两两比对，然后逐层筛选参数组合（注意：下面的代码实际使用的是 GridSearchCV 穷举网格搜索）

# Model inputs: all remaining columns except the id and the label.
features = list(train.columns)
for excluded in ("card_id", "target"):
    features.remove(excluded)

# Hyper-parameter grid for the random forest. Keys must match the estimator's
# parameter names exactly (no surrounding whitespace), otherwise the search
# rejects them as invalid parameters.
parameter_space = {
    "n_estimators": [79, 80, 81],
    "min_samples_leaf": [29, 30, 31],
    "min_samples_split": [2, 3],
    "max_depth": [9, 10],
    # None means "consider all features", which is what "auto" meant for a
    # random-forest regressor; "auto" itself was removed in scikit-learn 1.3,
    # while None is accepted by every version.
    "max_features": [None, 80],
}

# Build the base estimator and set the parameters that are NOT searched.
# The split criterion is deliberately left at its default: the default is
# mean squared error in every scikit-learn version, whereas the explicit
# spelling "mse" was removed in 1.2 (renamed to "squared_error" in 1.0).

clf = RandomForestRegressor(
    n_jobs=15,
    random_state=22,
)

# Run the exhaustive grid search: 2-fold cross-validation, candidates ranked
# by negative mean squared error (higher is better).

grid = GridSearchCV(
    estimator=clf,
    param_grid=parameter_space,
    scoring="neg_mean_squared_error",
    cv=2,
)
grid.fit(X=train[features].values, y=train['target'].values)

# Best hyper-parameter combination found by the search.

grid.best_params_

# Estimator configured with the best parameters (GridSearchCV refits it on
# the full training data by default, since refit is not overridden above).

grid.best_estimator_

# RMSE of the best combination. best_score_ is the mean cross-validated
# score (not a score on the training set itself); the scoring is negative
# MSE, so negate it before taking the square root.

np.sqrt(-grid.best_score_)

目前评分3.69样子

#然后预测测试集结果并写入csv