While running an endpoint test in Azure ML, I encountered an error related to reading the input data.
The steps taken are as follows:
train.py code:
%%writefile $script_folder/train.py
import argparse
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
import joblib
import pickle
from azureml.core import Workspace, Dataset, Experiment
from azureml.core import Run
import re
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
import math
#ws = Workspace.from_config()
#az_dataset = Dataset.get_by_name(ws, 'pricing')
# let user feed in 2 parameters, the location of the data files (from datastore), and the regularization rate of the logistic regression model
#parser = argparse.ArgumentParser()
#parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
#parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate')
#args = parser.parse_args()
train_data = pd.read_csv("C:\\Users\\abhay\\Downloads\\Projects_DataScience\\Ensemble_Machine_Learning\\dataset\\train_update.csv")
column_datatypes = train_data.dtypes
categorical_columns = list(column_datatypes[column_datatypes=="object"].index.values)
continuous_columns = list(column_datatypes[column_datatypes=="float64"].index.values)
continuous_columns.remove('loss')
total_rows = train_data.shape[0]
columns_with_blanks_cat = np.random.randint(1,116,2)
columns_with_blanks_cont = np.random.randint(117,130,3)
columns_with_blank = np.append(columns_with_blanks_cat,columns_with_blanks_cont)
#for every column insert 5 blanks at random locations
for col in columns_with_blank:
    rows_with_blanks = np.random.randint(1,total_rows,5)
    train_data.iloc[rows_with_blanks,col] = np.nan
class Data_preprocessing:
    def __init__(self, train_data):
        self.train_data = train_data

    def missing_value_continuous(self, column_names_with_specific_type, imputation_type="mean"):  # null value imputation with mean/median
        if imputation_type == "mean":  # mean imputation
            mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
            mean_imputer.fit(self.train_data[column_names_with_specific_type])
            self.train_data[column_names_with_specific_type] = mean_imputer.transform(self.train_data[column_names_with_specific_type])
        if imputation_type == "median":  # median imputation
            median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
            median_imputer.fit(self.train_data[column_names_with_specific_type])
            self.train_data[column_names_with_specific_type] = median_imputer.transform(self.train_data[column_names_with_specific_type])
        return self.train_data

    def missing_value_categorical(self, column_names_with_specific_type, imputation_type="most_frequent"):  # impute missing categorical values with the most frequent category
        most_frequent = SimpleImputer(strategy="most_frequent")
        most_frequent.fit(self.train_data[column_names_with_specific_type])
        self.train_data[column_names_with_specific_type] = most_frequent.transform(self.train_data[column_names_with_specific_type])  # was train_data[...]: use the instance copy
        return self.train_data

    def outlier_treatment(self, Q1, Q3, IQR, columns_with_outlier, action):  # outlier treatment
        if action == "median":
            for i in range(len(columns_with_outlier)):
                column_name = columns_with_outlier[i]
                median_outlier = np.median(self.train_data[column_name])
                self.train_data.loc[self.train_data[((self.train_data[column_name]<(Q1[column_name]-(1.5*IQR[column_name])))|(self.train_data[column_name]>(Q3[column_name]+(1.5*IQR[column_name]))))].index,column_name] = median_outlier
        if action == "mean":
            for i in range(len(columns_with_outlier)):
                column_name = columns_with_outlier[i]
                mean_outlier = np.mean(self.train_data[column_name])
                self.train_data.loc[self.train_data[((self.train_data[column_name]<(Q1[column_name]-(1.5*IQR[column_name])))|(self.train_data[column_name]>(Q3[column_name]+(1.5*IQR[column_name]))))].index,column_name] = mean_outlier
        if action == "remove":
            for i in range(len(columns_with_outlier)):
                column_name = columns_with_outlier[i]
                self.train_data = self.train_data[~((self.train_data[column_name]<(Q1[column_name]-(1.5*IQR[column_name])))|(self.train_data[column_name]>(Q3[column_name]+(1.5*IQR[column_name]))))]
        return self.train_data
column_names = np.array(train_data.columns)
Data_preprocessing_obj = Data_preprocessing(train_data)
train_data = Data_preprocessing_obj.missing_value_continuous(continuous_columns,"median")
train_data = Data_preprocessing_obj.missing_value_categorical(categorical_columns)
columns_with_outlier = ['cont7','cont9','cont10']
Q1 = train_data[continuous_columns].quantile(0.25)
Q3 = train_data[continuous_columns].quantile(0.75)
IQR = (Q3-Q1)
train_data = Data_preprocessing_obj.outlier_treatment(Q1,Q3,IQR,columns_with_outlier,"median")
def feature_selection_numerical_variables(train_data, qthreshold, corr_threshold, exclude_numerical_cols_list):
    num_colums = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numerical_columns = list(train_data.select_dtypes(include=num_colums).columns)
    numerical_columns = [column for column in numerical_columns if column not in exclude_numerical_cols_list]
    #remove variables with constant variance
    constant_filter = VarianceThreshold(threshold=0)
    constant_filter.fit(train_data[numerical_columns])
    constant_columns = [column for column in train_data[numerical_columns].columns
                        if column not in train_data[numerical_columns].columns[constant_filter.get_support()]]
    if len(constant_columns) > 0:
        train_data.drop(labels=constant_columns, axis=1, inplace=True)
    #remove deleted columns from the working list
    numerical_columns = [column for column in numerical_columns if column not in constant_columns]
    #remove quasi-constant variables
    qconstant_filter = VarianceThreshold(threshold=qthreshold)
    qconstant_filter.fit(train_data[numerical_columns])
    qconstant_columns = [column for column in train_data[numerical_columns].columns
                         if column not in train_data[numerical_columns].columns[qconstant_filter.get_support()]]  # was constant_filter: bug fix
    if len(qconstant_columns) > 0:
        train_data.drop(labels=qconstant_columns, axis=1, inplace=True)
    #remove deleted columns from the working list
    numerical_columns = [column for column in numerical_columns if column not in qconstant_columns]
    #remove correlated variables
    correlated_features = set()
    correlation_matrix = train_data[numerical_columns].corr()
    ax = sns.heatmap(
        correlation_matrix,
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True)
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right')
    #print(correlation_matrix)
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > corr_threshold:
                colname = correlation_matrix.columns[i]
                colcompared = correlation_matrix.columns[j]
                #check if the column compared against is not in the columns excluded list
                if colcompared not in correlated_features:
                    correlated_features.add(colname)
    train_data.drop(labels=correlated_features, axis=1, inplace=True)
    return train_data, constant_columns, qconstant_columns, correlated_features
train_data, constant_columns, qconstant_columns, correlated_features = feature_selection_numerical_variables(train_data, 0.01, 0.75, ['loss', 'id'])
for cf1 in categorical_columns:
    le = LabelEncoder()
    le.fit(train_data[cf1].unique())
    filename = cf1 + ".sav"
    pickle.dump(le, open(filename, 'wb'))  # persist the fitted encoder for reuse at scoring time
    train_data[cf1] = le.transform(train_data[cf1])
#snippet to count the unique values within each categorical column
df = pd.DataFrame(columns=["Column_Name", "Count"])
for cat in categorical_columns:
    unique_value_count = len(train_data[cat].unique())
    df = df.append({'Column_Name': cat, "Count": int(unique_value_count)}, ignore_index=True)
columns_unique_value = np.array(df.Count.value_counts().index)
#snippet to identify the dependent/correlated categorical variables and drop them
columns_to_drop_cat = set()
correlated_columns = dict()
for unique_value_count in columns_unique_value:
    if unique_value_count > 1:
        categorical_columns = df.loc[df.Count == unique_value_count, 'Column_Name']
        categorical_columns = categorical_columns.reset_index(drop=True)
        columns_length = len(categorical_columns)
        for col in range(columns_length - 1):
            column_to_compare = categorical_columns[col]
            columns_compare_against = categorical_columns[(col + 1):columns_length]
            chi_scores = chi2(train_data[columns_compare_against], train_data[column_to_compare])
            if column_to_compare not in columns_to_drop_cat:
                columns_to_be_dropped = [i for i in range(len(columns_compare_against)) if chi_scores[1][i] <= 0.05]
                columns_to_drop_array = np.array(columns_compare_against)[columns_to_be_dropped]
                correlated_columns[column_to_compare] = columns_to_drop_array
                columns_to_drop_cat.update(columns_to_drop_array)
train_data = train_data.drop(columns_to_drop_cat,axis=1)
correlated_features = list(correlated_features)
columns_to_drop_cat = list(columns_to_drop_cat)
columns_to_drop_cat.extend(correlated_features)
columns_to_drop = columns_to_drop_cat.copy()
#output the columns_to_drop file to a csv
columns_to_drop_df=pd.DataFrame(columns_to_drop,columns=['colnames'])
#columns_to_drop_df.to_csv("/model/columns_to_drop.csv",index=False)
train_data['loss'] = np.log(train_data['loss'])
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
#convert the int64 columns to categorical
Column_datatypes= train_data.dtypes
Integer_columns = list(Column_datatypes.where(lambda x: x =="int64").dropna().index.values)
train_data[Integer_columns] = train_data[Integer_columns].astype('category',copy=False)
X,y = train_data.drop(['id','loss'],axis=1),train_data['loss']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) # perform train test split
ref_cols=X_train.columns
from sklearn.ensemble import GradientBoostingRegressor #GBM algorithm
gbm_base = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0)
trained_model=gbm_base.fit(X_train,y_train)
# Predict the outcome using Test data - Score Model
Y_test_predict_tuned = gbm_base.predict(X_test)
# Get the probability score - Scored Probabilities
#Y_prob = gbm_base.predict_proba(X_test)[:, 1]
# Get Confusion matrix and the accuracy/score - Evaluate
score = np.sqrt(mean_squared_error(y_test, Y_test_predict_tuned))
#print('Export the model to model.pkl')
#f = open('fwrk2.pkl', 'wb')
#pickle.dump(trained_model, f)
#f.close()
#print('Import the model from model.pkl')
#f2 = open('fwrk2.pkl', 'rb')
#clf2 = pickle.load(f2)
#X_new = [[154, 54, 35]]
#print('New Sample:', X_new)
#print('Predicted class:', clf2.predict(X_new))
#os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
#joblib.dump(value=trained_model, filename='outputs/fwrk2.pkl')
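Note that score.py below loads the model with Model.get_model_path('fwrk2'), so the pickle produced by this script has to be registered under that name. A minimal sketch of the export-and-register step that the commented-out lines above gesture at (the workspace handle and file name are assumptions, not details from the original post):
import os
import joblib
from azureml.core import Workspace
from azureml.core.model import Model

os.makedirs('outputs', exist_ok=True)
joblib.dump(value=trained_model, filename='outputs/fwrk2.pkl')  # serialize the fitted GBM
ws = Workspace.from_config()  # assumes a config.json is available
# register the pickle so Model.get_model_path('fwrk2') resolves at scoring time
Model.register(workspace=ws, model_path='outputs/fwrk2.pkl', model_name='fwrk2')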
score.py code:
%%writefile score.py
import json
import numpy as np
import os
import pickle
import pandas as pd
import joblib
from sklearn.ensemble import GradientBoostingRegressor
from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
from inference_schema.parameter_types.pandas_parameter_type import PandasParameterType
from azureml.core.model import Model
def init():
    global model
    #model = joblib.load('recommender.pkl')
    model_path = Model.get_model_path('fwrk2')
    model = joblib.load(model_path)
input_sample = pd.DataFrame(data=[{"cat1":0, "cat4": 0, "cat14": 0, "cat15": 0, "cat18": 0, "cat19": 0, "cat20": 0, "cat21": 0
, "cat22": 0, "cat35": 0, "cat42":0, "cat47": 0, "cat48": 0, "cat55": 0
, "cat56": 0, "cat58": 0, "cat59": 0, "cat60": 0, "cat61": 0, "cat62": 0
, "cat63": 0, "cat64": 0, "cat68": 0, "cat70": 0, "cat76": 0, "cat77":0
, "cat78": 0, "cat82": 0, "cat85": 0, "cat86": 0, "cat89": 0, "cat91": 0
, "cat92": 0, "cat93": 0, "cat94":0, "cat96": 0, "cat97": 0, "cat99": 0
, "cat100": 0, "cat101": 0, "cat103": 0, "cat105": 0, "cat107": 0, "cat109":0
, "cat110": 0, "cat111": 0, "cat112": 0, "cat113": 0, "cat116": 0, "cont1": 0
, "cont2": 0, "cont3": 0, "cont4": 0, "cont5": 0
, "cont6": 0, "cont7": 0, "cont8": 0, "cont14": 0}])
output_sample = np.array([0])  # This is an integer-type sample; use the data type that reflects the expected result
@input_schema('data', PandasParameterType(input_sample))
@output_schema(NumpyParameterType(output_sample))
def run(data):
    try:
        result = model.predict(data)
        # you can return any datatype as long as it is JSON-serializable
        return result.tolist()
    except Exception as e:
        error = str(e)
        return error
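For context, here is a minimal sketch of how a script like this is typically wired into a deployment (the conda spec file, service name, and ACI target are illustrative assumptions, not details from the post):
from azureml.core import Environment, Workspace
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice

ws = Workspace.from_config()
env = Environment.from_conda_specification(name="scoring-env", file_path="env.yml")  # hypothetical conda spec
inference_config = InferenceConfig(entry_script="score.py", environment=env)
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)
service = Model.deploy(ws, "fwrk2-service", [Model(ws, 'fwrk2')], inference_config, deployment_config)
service.wait_for_deployment(show_output=True)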
The endpoint deployed successfully, and in the Azure portal I can see the Test feature, where I enter the input values and POST them:
[{"cat1":0, "cat4": 0, "cat14": 0, "cat15": 0, "cat18": 0, "cat19": 0, "cat20": 0, "cat21": 0
, "cat22": 0, "cat35": 0, "cat42":0, "cat47": 0, "cat48": 0, "cat55": 0
, "cat56": 0, "cat58": 0, "cat59": 0, "cat60": 0, "cat61": 0, "cat62": 0
, "cat63": 0, "cat64": 0, "cat68": 0, "cat70": 0, "cat76": 0, "cat77":0
, "cat78": 0, "cat82": 0, "cat85": 0, "cat86": 0, "cat89": 0, "cat91": 0
, "cat92": 0, "cat93": 0, "cat94":0, "cat96": 0, "cat97": 0, "cat99": 0
, "cat100": 0, "cat101": 0, "cat103": 0, "cat105": 0, "cat107": 0, "cat109":0
, "cat110": 0, "cat111": 0, "cat112": 0, "cat113": 0, "cat116": 0, "cont1": 0
, "cont2": 0, "cont3": 0, "cont4": 0, "cont5": 0
, "cont6": 0, "cont7": 0, "cont8": 0, "cont14": 0}])
Error: "'GradientBoostingRegressor' object has no attribute 'n_features'"
Could someone point out what is going wrong when the input sample above is executed? Is it related to package versions, and if so, how do I update them to resolve it?
Answered on 2022-09-22 11:06:46
GradientBoostingRegressor can take its parameters from a dictionary, so replace the tuple-style keyword arguments with a dictionary that is unpacked into the constructor. Swap out the gradient boosting setup as shown below. This is the block as it currently stands:
gbm_base = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0)
Replace it with the following block:
gbm_base = {
    "max_depth": 2,
    "n_estimators": 3,
    "learning_rate": 1.0,
}
gbm = GradientBoostingRegressor(**gbm_base)
Then use the gbm variable for any further work with the GradientBoostingRegressor.
Edit 1: optional procedure
I tried to reproduce the problem with my own sample, and GradientBoostingRegressor did not report any issues. Check out the code block below.
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
X, y = make_regression(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = GradientBoostingRegressor(random_state=0)
reg.fit(X_train, y_train)
Output: GradientBoostingRegressor(random_state=0)
reg.predict(X_test[1:2])
Output: array([-61...])
reg.score(X_test, y_test)
Output: 0.4...
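On the version question: an AttributeError like "no attribute 'n_features'" commonly appears when a model is pickled under one scikit-learn version and unpickled under a different one, since internal attributes such as n_features_ have been renamed and removed across releases. It is therefore worth pinning the training version in the scoring environment. A minimal sketch using azureml-core's Environment API (the version number is a placeholder; use whatever sklearn.__version__ reports in the training environment):
import sklearn
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

print(sklearn.__version__)  # the version the model was trained with
env = Environment(name="scoring-env")
env.python.conda_dependencies = CondaDependencies.create(
    pip_packages=[
        "scikit-learn==1.0.2",  # placeholder: pin to the training version printed above
        "pandas",
        "numpy",
        "joblib",
        "inference-schema[pandas-support]",
        "azureml-defaults",
    ]
)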
https://stackoverflow.com/questions/73812159