Dataset and leaderboard:
https://datahack.analyticsvidhya.com/contest/janatahack-machine-learning-in-agriculture/#LeaderBoard
import pandas as pd
# Load the training data (path assumes Google Colab with Drive mounted)
train = pd.read_csv("/content/drive/My Drive/train.csv")
# To score the competition test set, load it and reuse this same pipeline:
# test = pd.read_csv("/content/drive/My Drive/test.csv")
# test.head(1)
# train = test
train.head(1)
from sklearn.impute import SimpleImputer
import numpy as np
# Number_Weeks_Used has missing values; fill them with the column mean
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
train["Number_Weeks_Used"] = imp.fit_transform(train["Number_Weeks_Used"].values.reshape(-1, 1))
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scale_cols=['Estimated_Insects_Count','Number_Weeks_Used','Number_Doses_Week','Number_Weeks_Quit']
def scale_down(col):
    train[col] = scaler.fit_transform(train[col].values.reshape(-1, 1))

for c in scale_cols:
    scale_down(c)
train.head(2)
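Since StandardScaler standardizes each feature independently, refitting it per column as above gives the same result as fitting all four columns in one call; a more idiomatic equivalent:

train[scale_cols] = StandardScaler().fit_transform(train[scale_cols])  # scale all four numeric columns at once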
data=pd.get_dummies(train, prefix=['Crop_Type', 'Soil_Type','Pesticide_Use_Category','Season'], columns=['Crop_Type', 'Soil_Type','Pesticide_Use_Category','Season'])
data.head(2)
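One caveat if the commented-out test-set path above is used: get_dummies can produce different columns on train and test when a category is absent from one of them. A common fix (a sketch, assuming the test frame was encoded the same way into a hypothetical data_test) is to reindex against the training columns:

# hypothetical data_test frame; align it to the training columns, filling absent dummies with 0
data_test = data_test.reindex(columns=data.columns.drop('Crop_Damage'), fill_value=0)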
X_df, y_df = data.drop(['Crop_Damage','ID'],axis=1),data['Crop_Damage']
# X_df = data.drop(['ID'],axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=123)
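Crop_Damage is heavily skewed toward class 0, so a stratified split keeps the class ratios identical in both folds; a minimal variant of the call above:

X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df, test_size=0.2, random_state=123,
    stratify=y_df)  # preserve the class proportions in both folds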
# X_train.head(1)
import xgboost as xgb
from sklearn.metrics import mean_squared_error
xg_reg = xgb.XGBClassifier(n_estimators=500)  # despite the "reg" name, this is a multi-class classifier
xg_reg.fit(X_train,y_train)
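With 500 trees it is easy to overfit; early stopping on the hold-out fold can pick the tree count instead (a sketch; in xgboost >= 2.0 early_stopping_rounds moves from fit() to the XGBClassifier constructor):

xg_reg.fit(X_train, y_train,
           eval_set=[(X_test, y_test)],  # watch hold-out performance each round
           early_stopping_rounds=20,     # stop once 20 rounds pass without improvement
           verbose=False)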
type(X_train),type(X_test)
X_test = X_test.values  # convert the hold-out features to a NumPy array before predicting
type(X_test)
X_train.shape,X_test.shape
preds = xg_reg.predict(X_test)
# preds = xg_reg.predict(X_df)
# rmse = np.sqrt(mean_squared_error(y_test, preds))
# print("RMSE: %f" % (rmse))
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# sub1=pd.DataFrame({"ID":train['ID'],"Crop_Damage":preds})
# sub1.to_csv("sub1.csv",index=False)
## Sampling techniques for the class imbalance
from sklearn.utils import resample
# recombine features and labels so resampling keeps each row's label attached
X = pd.concat([X_train, y_train], axis=1)
X.head()
# split the training rows by class
z = X[X.Crop_Damage == 0]
o = X[X.Crop_Damage == 1]
t = X[X.Crop_Damage == 2]
# resample class 0 so its count matches class 2
z_resampled = resample(z,
                       replace=True,      # sample with replacement
                       n_samples=len(t),  # match the class-2 count
                       random_state=27)   # reproducible results
# combine class 2 with the resampled class 0
# (note: the class-1 rows in `o` are not included in this experiment)
upsampled = pd.concat([t, z_resampled])
# check new class counts
upsampled.Crop_Damage.value_counts()
upsampled.shape
X_train_up=upsampled.drop('Crop_Damage',axis=1)
y_train_up=upsampled.Crop_Damage
xg_reg.fit(X_train_up,y_train_up)
y_train_up.head(1)
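To reproduce the upsampling score quoted below, evaluate the refit model on the untouched hold-out split, never on the resampled rows (a sketch; preds_up is a name introduced here):

preds_up = xg_reg.predict(X_test)
print("Upsampled accuracy: %.2f%%" % (accuracy_score(y_test, preds_up) * 100.0))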
**UPSAMPLING**: 69%
**DOWNSAMPLING**: 67%
**SMOTE**: 82%
**NO CHANGE**: 84%
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy='minority')
X_train, y_train = sm.fit_resample(X_train, y_train)  # fit_sample was renamed to fit_resample in newer imblearn
X_train.shape
X_train[:1]  # first resampled row (works whether fit_resample returned an array or a DataFrame)
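A quick check that SMOTE actually balanced the class counts:

pd.Series(y_train).value_counts()  # the minority class should now match the majority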