Monday, 26 October 2020

LIGHTGBM FOR MULTI-LABEL
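The approach: treat the multi-label problem as 206 independent binary classification tasks (binary relevance) and train one LightGBM model per label. Per-label out-of-fold predictions are then used to tune a small thresholding trick that shaves a little more off the log-loss.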

import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
warnings.simplefilter('ignore')
sns.set()
%matplotlib inline
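
The cells below reference train, test, train_target, targets, X, features and sub, which come from setup cells not shown here. A minimal sketch of that setup (the file names and the 'sig_id' ID column are assumptions, not part of the original notebook):

# Hypothetical loading step -- file names and 'sig_id' are assumptions
train = pd.read_csv('train_features.csv')
test = pd.read_csv('test_features.csv')
train_target = pd.read_csv('train_targets.csv')
sub = pd.read_csv('sample_submission.csv')

# One binary column per label -- 206 of them in this dataset
targets = [c for c in train_target.columns if c != 'sig_id']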


# One-hot encoding: build the dummies on train and test together so both
# frames end up with an identical set of columns
for feature in ['cp_time', 'cp_type', 'cp_dose']:
    concat = pd.concat([train[feature], test[feature]], ignore_index=True)
    dummies = pd.get_dummies(concat, dummy_na=True, dtype=np.uint8, prefix=feature)
    train = pd.concat([train, dummies.iloc[:train.shape[0]]], axis=1)
    test = pd.concat([test, dummies.iloc[train.shape[0]:].reset_index(drop=True)], axis=1)
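
The modelling loop below also needs the feature matrix X and the feature list features. A hedged sketch, since the original builds them elsewhere (the dropped columns follow from the loading and encoding steps above):

# Hypothetical: drop the ID column and the raw categoricals that were just
# one-hot encoded; everything that remains is used as a model feature
drop_cols = ['sig_id', 'cp_time', 'cp_type', 'cp_dose']
features = [c for c in train.columns if c not in drop_cols]
X = train[features]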


params = {'num_leaves': 491,
          'min_child_weight': 0.03,
          'feature_fraction': 0.3,   # each tree sees ~30% of the columns
          'bagging_fraction': 0.4,   # ~40% of the rows per bagging round
          'min_data_in_leaf': 106,
          'objective': 'binary',     # one independent binary task per label
          'max_depth': -1,           # no depth limit; num_leaves does the constraining
          'learning_rate': 0.01,
          'boosting_type': 'gbdt',
          'bagging_seed': 11,
          'metric': 'binary_logloss',
          'verbosity': 0,
          'reg_alpha': 0.4,          # L1 regularization
          'reg_lambda': 0.6,         # L2 regularization
          'random_state': 47
         }
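
A note on these settings: feature_fraction and bagging_fraction subsample columns and rows for each tree, which acts as regularization on a wide feature matrix. One caveat: LightGBM only performs row bagging when bagging_freq is set to a positive value, so with the default bagging_freq = 0 (as here) the bagging_fraction above has no effect unless bagging_freq is added.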



cumulative_loss = 0
skf = StratifiedKFold(n_splits=3, random_state=47, shuffle=True)

print('Execution time | Model number | logloss | new logloss | best coeff')
# 206 different models, one per label
for model, target in enumerate(targets, 1):
    y = train_target[target]
    start_time = time()
    preds = np.zeros(test.shape[0])
    oof = np.zeros(X.shape[0])

    for trn_idx, val_idx in skf.split(X, y):
        trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X.iloc[val_idx], label=y.iloc[val_idx])
        clf = lgb.train(params, trn_data, 10000, valid_sets=[trn_data, val_data],
                        verbose_eval=0, early_stopping_rounds=20)
        oof[val_idx] = clf.predict(X.iloc[val_idx])           # out-of-fold predictions
        preds += clf.predict(test[features]) / skf.n_splits   # fold-averaged test predictions

    loss = log_loss(y, oof)
    
    # Hacking the metric: for rare labels, zeroing out predictions well below
    # the mean can reduce the log-loss. Search for the best threshold on the
    # out-of-fold predictions, then apply it once to the test predictions.
    coeffs = [3, 2, 1.5, 1.4, 1.3, 1.2, 1.1, 1.0, 0.9, 0.8, 0.7]
    best_coeff = 0
    best_loss = loss
    for coeff in coeffs:
        new_oof = oof.copy()
        new_oof[new_oof < new_oof.mean() / coeff] = 0
        new_loss = log_loss(y, new_oof)
        if new_loss < best_loss:
            best_coeff = coeff
            best_loss = new_loss

    if best_coeff:
        preds[preds < preds.mean() / best_coeff] = 0
    # End of metric hacking
    sub[target] = preds

    cumulative_loss += best_loss
    print('{}\t\t{}\t{:.5f}\t\t{:.5f}\t\t{}'.format(
        str(datetime.timedelta(seconds=time() - start_time))[:7], model, loss, best_loss, best_coeff))
    del preds, oof, start_time, y, loss, best_loss, new_oof
    gc.collect()


print('Overall mean loss: {:.5f}'.format(cumulative_loss / len(targets)))
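
With every target column of sub filled in, the last step would be writing the file out (a one-liner sketch; the file name is an assumption):

sub.to_csv('submission.csv', index=False)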