Sunday, 26 July 2020

Tabular data sampling and SMOTE technique

dataset rank


# Crop-damage classification experiment.
#
# Steps: mean-impute and standardize the numeric columns, one-hot encode the
# categorical columns, fit an XGBoost baseline on an 80/20 split, then try two
# ways of rebalancing the training split (sklearn `resample` and SMOTE).
#
# Bare expressions such as `train.head(1)` are notebook cell outputs; they have
# no effect when this is run as a plain script.
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error  # only used by the commented-out RMSE lines
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

train = pd.read_csv("/content/drive/My Drive/train.csv")
# test=pd.read_csv("/content/drive/My Drive/test.csv")
# test.head(1)
# train=test
train.head(1)

# NOTE(review): the imputer and the scaler below are fitted on the whole table
# before the train/test split, so held-out rows influence the statistics.
# Left as-is to keep the recorded numbers comparable.

# Fill missing values in Number_Weeks_Used with the column mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
train["Number_Weeks_Used"] = imp.fit_transform(
    train["Number_Weeks_Used"].values.reshape(-1, 1)
).ravel()  # flatten the (n, 1) result back to one value per row

scaler = StandardScaler()
scale_cols = [
    'Estimated_Insects_Count',
    'Number_Weeks_Used',
    'Number_Doses_Week',
    'Number_Weeks_Quit',
]


def scale_down(col):
    """Standardize column `col` of the global `train` frame in place.

    The shared `scaler` is re-fitted on every call, so after the loop below it
    only remembers the statistics of the last column processed.
    """
    train[col] = scaler.fit_transform(train[col].values.reshape(-1, 1)).ravel()


for c in scale_cols:
    scale_down(c)
train.head(2)

# One-hot encode the categorical columns.
data = pd.get_dummies(
    train,
    prefix=['Crop_Type', 'Soil_Type', 'Pesticide_Use_Category', 'Season'],
    columns=['Crop_Type', 'Soil_Type', 'Pesticide_Use_Category', 'Season'],
)
data.head(2)

# Features are everything except the target and the row identifier.
X_df, y_df = data.drop(['Crop_Damage', 'ID'], axis=1), data['Crop_Damage']
# X_df = data.drop(['ID'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df, test_size=0.2, random_state=123
)
# X_train.head(1)

# Baseline model on the untouched training split.
xg_reg = xgb.XGBClassifier(n_estimators=500)
xg_reg.fit(X_train, y_train)
type(X_train), type(X_test)
# X_test is kept as a DataFrame (the original converted it to a bare ndarray)
# so prediction sees the same column layout the model was fitted on.
type(X_test)
X_train.shape, X_test.shape
preds = xg_reg.predict(X_test)
# preds = xg_reg.predict(X_df)
# rmse = np.sqrt(mean_squared_error(y_test, preds))
# print("RMSE: %f" % (rmse))
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# sub1=pd.DataFrame({"ID":train['ID'],"Crop_Damage":preds})
# sub1.to_csv("sub1.csv",index=False)

## sampling technique
# Re-attach the target so rows can be filtered by class.
X = pd.concat([X_train, y_train], axis=1)
X.head()
z = X[X.Crop_Damage == 0]
o = X[X.Crop_Damage == 1]
t = X[X.Crop_Damage == 2]

# Resample class 0 (with replacement) to the size of class 2. Whether this is
# an up- or a down-sample depends on the class sizes; presumably class 0 is
# the majority, making it a down-sample -- TODO confirm against the data.
# The variable name is inherited from a fraud-detection example.
fraud_upsampled = resample(
    z,                   # class being resampled
    replace=True,        # sample with replacement
    n_samples=len(t),    # match the number of class-2 rows
    random_state=27,     # reproducible results
)

# Combine the resampled class 0 with classes 1 and 2. The original left `o`
# out, so the refit below saw only labels {0, 2}.
# NOTE(review): class 1 is merged back at its original size; resample it as
# well if fully balanced classes are wanted.
upsampled = pd.concat([t, o, fraud_upsampled])

# check new class counts
upsampled.Crop_Damage.value_counts()
upsampled.shape
X_train_up = upsampled.drop('Crop_Damage', axis=1)
y_train_up = upsampled.Crop_Damage
xg_reg.fit(X_train_up, y_train_up)
y_train_up.head(1)

# Figures recorded by the author (presumably test accuracy -- TODO confirm):
#   upsampling   69 %
#   downsampling 67 %
#   SMOTE        82 %
#   no change    84 %

# SMOTE: synthesize new samples for the minority class only.
# `sampling_strategy` is passed by keyword and `fit_resample` replaces the
# removed `fit_sample`.
sm = SMOTE(sampling_strategy='minority')
X_train, y_train = sm.fit_resample(X_train, y_train)
X_train.shape
# First resampled row; np.asarray makes this work whether the resampler
# returned a DataFrame or an ndarray.
np.asarray(X_train)[0]

No comments:

Post a Comment