admin管理员组文章数量:1650971
2019独角兽企业重金招聘Python工程师标准>>>
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
import xgboost as xgb
class Multi_column_encoder:
    """One-hot encode a list of categorical columns of a DataFrame.

    Each column in ``column_name_list`` is replaced by dummy columns named
    ``<column>_<category>`` (categories in sorted order), reproducing the
    naming of the original LabelEncoder + OneHotEncoder pipeline.
    """

    def __init__(self, df, column_name_list):
        # Copy up front: the original implementation assigned a temporary
        # "Dummies" column directly onto the caller's frame, so the caller's
        # DataFrame was left with a stray "Dummies" column after encoding.
        self.df = df.copy()
        self.column_name_list = column_name_list

    def column_encoder(self):
        """Return the frame with the categorical columns dummy-encoded.

        Uses pd.get_dummies, which aligns on the frame's own index.  The
        original code built the dummy frame with a fresh RangeIndex and then
        pd.concat-ed it onto self.df, so any input whose index was not
        0..n-1 (e.g. after row filtering) silently gained NaN rows.
        """
        return pd.get_dummies(
            self.df,
            columns=self.column_name_list,
            # Float dummies match the dense output of the old OneHotEncoder.
            dtype=float,
        )
# --- Train an XGBoost classifier on the affairs data -----------------------
data = pd.read_csv("D:/data/affairs.csv")
column_name_list = ['gender','children']
df = Multi_column_encoder(data, column_name_list).column_encoder()

# Binarise the label: an affair count of 0 stays 0, anything else becomes 1.
df.rename(columns={'affairs':'target'}, inplace=True)
df['target'] = df['target'].apply(lambda v: v if v == 0 else 1)

Y = df.target
X = df.drop('target', axis=1)

# 80/20 split with a fixed seed so the run is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, random_state=0)

clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)
######################
######df1有14列,与训练列交集为10
# Load a validation file and apply the same categorical encoding as training.
# Its categorical columns carry extra levels (boy/girl, man/women), so the
# encoded frame has 14 columns versus the 10 seen at training time -- the
# recorded dtypes listing below shows the full 14-column result.
data1 = pd.read_csv("D:/data/new_valid_info.csv")
df1 = Multi_column_encoder(data1,column_name_list).column_encoder()
"""
len(df1.columns)
Out[13]: 14
df1.dtypes
Out[14]:
age int64
yearsmarried float64
religiousness int64
education int64
occupation int64
rating int64
gender_boy float64
gender_female float64
gender_girl float64
gender_male float64
children_man float64
children_no float64
children_women float64
children_yes float64
dtype: object
"""
# Feature names the model was fitted on (10 of them; recorded listing below).
train_list = list(X_train.columns) ### 10 names
"""
train_list
Out[17]:
['age',
'yearsmarried',
'religiousness',
'education',
'occupation',
'rating',
'gender_female',
'gender_male',
'children_no',
'children_yes']
len(train_list)
Out[18]: 10
"""
# Column names of the encoded validation frame (14; recorded listing below).
df1_list = list(df1.columns) ## 14 names
"""
df1_list
Out[20]:
['age',
'yearsmarried',
'religiousness',
'education',
'occupation',
'rating',
'gender_boy',
'gender_female',
'gender_girl',
'gender_male',
'children_man',
'children_no',
'children_women',
'children_yes']
len(df1_list)
Out[21]: 14
"""
# Case 1: the train/df1 column intersection has 10 names -- the same count as
# the training features -- so every training column is present and the
# validation frame can be indexed directly with df1[X_train.columns].
a = list(set(train_list).intersection(set(df1_list)))
df11 = df1[X_train.columns]
preds_11 = clf.predict_proba(df11)
"""
a
Out[23]:
['rating',
'gender_female',
'children_no',
'religiousness',
'education',
'children_yes',
'yearsmarried',
'age',
'gender_male',
'occupation']
len(a)
Out[24]: 10
"""
# TODO: take the training column order from the saved model itself
# (f_names = clf.feature_names style) so prediction does not need an X_train
# object in scope; later fold this into the batch one-hot encoder class and
# port the whole flow to Scala.
def align_features(df, train_column_list):
    """Align a prediction frame's columns with the training feature list.

    Replaces the original top-level sketch, which could not run: it used
    the invalid keyword sequence ``else if`` (Python spells it ``elif``),
    placed ``return`` statements outside any function, and repeated the
    same condition in both branches.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to score; may carry extra columns or be missing some features.
    train_column_list : list of str
        Feature names, in the order the model was trained with.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted/extended to exactly ``train_column_list``, in
        training order; features absent from ``df`` are zero-filled.
    """
    inter_list = list(set(train_column_list).intersection(df.columns))
    if len(inter_list) == len(train_column_list):
        # Every training feature is present: select them in training order.
        return df[train_column_list]
    # Some features are missing: keep the shared ones, then reindex to
    # zero-fill the absent columns and restore the training column order.
    return df[inter_list].reindex(columns=train_column_list, fill_value=0)
####
# --- Case 2: validation set that is MISSING a training feature --------------
data3 = pd.read_csv("D:/data/valid_data.csv")
# BUG FIX: the original passed the undefined name `data2` here (NameError);
# the frame read on the previous line is `data3`.
df3 = Multi_column_encoder(data3, column_name_list).column_encoder()
df3_list = list(df3.columns)
# The train/df3 column intersection has only 9 names: one training feature
# is absent from this file.
c = list(set(train_list).intersection(set(df3_list)))
df33 = df3[c]
# reindex both zero-fills the missing column and restores the exact training
# column order (see https://www.cnblogs.com/rrttp/p/8108188.html).
df33 = df33.reindex(columns=train_list, fill_value=0)
# Now the frame matches the training layout and can be scored.
preds_33 = clf.predict_proba(df33)
####################
以上两种情况进行函数封装
加载模型时,导入最佳迭代次数
model.best_ntree_limit
preds = model.predict(xgb_test,ntree_limit=model.best_ntree_limit)
XGBClassifier与xgb.DMatrix的差别
保存模型、加载模型、加载模型附带的参数如best_ntree_limit,以及feature_names
https://stackoverflow.com/questions/38212649/feature-importance-with-xgbclassifier
pipeline深度封装
####################
##########
# Scratch notes: at prediction time, take the feature order from the fitted
# model itself instead of from an X_train that may no longer be in scope.
f_names = clf.feature_names  # NOTE(review): on the sklearn-API XGBClassifier this may live on the booster (clf.get_booster().feature_names) -- confirm
df = df[f_names]
# Same idea when the training frame is still available:
test_df = test_df[train_df.columns]  # NOTE(review): train_df/test_df are not defined anywhere in this file
###############
pred_1 = clf.predict_proba(df1)
Traceback (most recent call last):
File "<ipython-input-83-2b2aad219a9d>", line 1, in <module>
pred_1 = clf.predict_proba(df1)
File "E:\WinPython\python-3.6.5.amd64\lib\site-packages\xgboost\sklearn.py", line 575, in predict_proba
ntree_limit=ntree_limit)
File "E:\WinPython\python-3.6.5.amd64\lib\site-packages\xgboost\core.py", line 1050, in predict
self._validate_features(data)
File "E:\WinPython\python-3.6.5.amd64\lib\site-packages\xgboost\core.py", line 1308, in _validate_features
data.feature_names))
ValueError: feature_names mismatch: ['age', 'yearsmarried', 'religiousness', 'education', 'occupation', 'rating', 'gender_female', 'gender_male', 'children_no', 'children_yes'] ['age', 'yearsmarried', 'religiousness', 'education', 'occupation', 'rating', 'gender_female', 'gender_male', 'children_no']
expected children_yes in input data
###########13
df11 = valid_df[X_train.columns]
Traceback (most recent call last):
File "<ipython-input-86-595c67c466c6>", line 1, in <module>
df11 = valid_df[X_train.columns]
File "E:\WinPython\python-3.6.5.amd64\lib\site-packages\pandas\core\frame.py", line 2133, in __getitem__
return self._getitem_array(key)
File "E:\WinPython\python-3.6.5.amd64\lib\site-packages\pandas\core\frame.py", line 2177, in _getitem_array
indexer = self.loc._convert_to_indexer(key, axis=1)
File "E:\WinPython\python-3.6.5.amd64\lib\site-packages\pandas\core\indexing.py", line 1269, in _convert_to_indexer
.format(mask=objarr[mask]))
KeyError: "Index(['children_yes'], dtype='object') not in index"
转载于:https://my.oschina.net/kyo4321/blog/1941132
本文标签: featurename
版权声明:本文标题:feature_name 内容由热心网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:https://m.elefans.com/xitong/1729531924a1204917.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论