Supervised learning: classifying weather conditions of Cologne Bonn airport

Shanshan Wang
shanshan.wang@uni-due.de
Jan. 7, 2021

Table of Contents

1 Introduction
  1.1 Main objective
  1.2 Outline
2 Data
  2.1 Data description
  2.2 Data cleaning
  2.3 Feature engineering
  2.4 A summary of the data
3 Classifier models
  3.1 Logistic regression
  3.2 K nearest neighbors
  3.3 Support vector machines
  3.4 Decision trees
  3.5 Bagging and random forest
  3.6 Boosting and stacking
  3.7 Comparison of models
4 Summary
5 Suggestions for next steps

1 Introduction

1.1 Main objective

Flight operations at an airport are strongly affected by weather conditions, so the correct classification and prediction of weather conditions is important for air travel. This study aims to classify the weather conditions at Cologne Bonn airport using supervised machine learning algorithms. To this end, eleven supervised learning models are used to classify the weather conditions based on features including air temperature, atmospheric pressure, humidity, wind and so on. The models are standard logistic regression, L1 regularized logistic regression, L2 regularized logistic regression, K nearest neighbors (KNN), support vector machines (SVM), decision trees, random forest, extra random forest, gradient boosted trees, adaptive boosting (AdaBoost) and a voting classifier. They are compared and ranked in terms of the precision, recall, F-score, accuracy, ROC-AUC score and the confusion matrix of the prediction results. For the case studied here, the random forest model shows the best prediction performance.

1.2 Outline

The report is organized as follows. Section 2 describes the data set, the data cleaning, the feature engineering and a short data summary. Section 3 introduces the 11 classifier models, applies them to the data and compares their performance. Section 4 summarizes the study, and Section 5 gives suggestions for next steps.

2 Data

2.1 Data description

The data set used in this study comes from the website https://rp5.ru/Weather_archive_in_Cologne,_Bonn_(airport). The data, at a resolution of one hour, span the period from October 1, 2021 to December 31, 2021. In total, 28 attributes are present in the raw data set. After data cleaning, 8 attributes are used in this study: 7 attributes serve as the features for the classification and 1 attribute, the weather condition, is the classification target. Table 1 lists the data type and description of the 8 attributes. The weather conditions from October to December 2021 comprise clouds, fog, rain and snow. For the prediction, each class is represented by an integer. The final cleaned data matrix has 2201 rows and 8 columns.

Table 1: Description of attributes in the used data set

Attribute | Data type | Description
T  | float64 | Air temperature (degrees Celsius) at a height of 2 metres above the earth's surface
Po | float64 | Atmospheric pressure at weather station level (millimeters of mercury)
P  | float64 | Atmospheric pressure reduced to mean sea level (millimeters of mercury)
U  | int64   | Relative humidity (%) at a height of 2 metres above the earth's surface
DD | int64   | Mean wind direction (compass points) at a height of 10-12 metres above the earth's surface over the 10-minute period immediately preceding the observation
Ff | int64   | Mean wind speed (meters per second) at a height of 10-12 metres above the earth's surface over the 10-minute period immediately preceding the observation
WW | int64   | Present weather reported from the weather station
Td | float64 | Dewpoint temperature (degrees Celsius) at a height of 2 metres above the earth's surface

2.2 Data cleaning

LabelEncoder from scikit-learn is used to fit_transform the "DD" and "WW" columns to integers.
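
A minimal sketch of this cleaning step, assuming the raw data were exported to a CSV file (the file name here is hypothetical) with the column names of Table 1:

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Hypothetical file name; the raw export from rp5.ru may differ.
df = pd.read_csv("cologne_bonn_weather.csv")

# Keep the 8 attributes of Table 1 and drop rows with missing values.
df = df[["T", "Po", "P", "U", "DD", "Ff", "WW", "Td"]].dropna()

# Encode the categorical columns "DD" (wind direction) and "WW" (weather condition)
# as integers.
for col in ["DD", "WW"]:
    df[col] = LabelEncoder().fit_transform(df[col])
```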

2.3 Feature engineering

2.4 A summary of the data

The raw data set has 2201 rows and 28 columns and contains many missing values. The data cleaning removed the missing values and reduced the number of attribute columns. The available data thus have 2201 rows and 8 columns, where 7 columns serve as the features for the classification and 1 column as the classification target. We normalized the 7 features, since their different scales would otherwise give them different importance in the classification. The correlations among the 7 features reveal a strong relationship between P and Po and between T and Td. We further split the data into a training and a test set, holding out 30% of the data for testing. The weather condition classes are cloud, represented by 0, fog, represented by 1, rain, represented by 2, and snow, represented by 3. Among the four classes, rain accounts for 54.45% of the observations, fog for 26.47%, cloud for 18.60% and snow for 0.45%.
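
A minimal sketch of the scaling and splitting described above, assuming the cleaned DataFrame df from Section 2.2; the choice of StandardScaler and the random_state are assumptions:

```python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 7 feature columns and the target column "WW" (weather condition).
features = ["T", "Po", "P", "U", "DD", "Ff", "Td"]
X, y = df[features], df["WW"]

# Correlations among the 7 features (e.g. P vs Po, T vs Td).
print(X.corr())

# Hold out 30% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)

# Scale the features so that large scales do not dominate the classification.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
```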

3 Classifier models

This section briefly introduces and applies 11 supervised learning models for the classification. Each model is first fitted to the training data set; for some models, a grid search is carried out to find the best estimator. Each fitted model is then used to predict the weather conditions on the test data set. The prediction performance is evaluated by the precision, recall, F-score, accuracy, ROC-AUC score and the confusion matrix of the predictions. The 11 classification models and their comparison are organized as follows.
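
The fit-predict-evaluate pattern is the same for all models. A generic sketch, assuming the split and scaled arrays from Section 2.4 and using logistic regression only as a placeholder:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, roc_auc_score)

# Placeholder model; the same pattern applies to all 11 classifiers.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Precision, recall and F-score per class.
print(classification_report(y_test, y_pred))
print("accuracy:", accuracy_score(y_test, y_pred))
# One-vs-rest ROC-AUC for the multi-class problem.
print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test),
                                multi_class="ovr"))
print(confusion_matrix(y_test, y_pred))
```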

3.1 Logistic regression

Logistic regression models the probabilities for classification problems with a logistic function.

The general logistic function in terms of the variable $x$ is $$P(x)=\frac{1}{1+e^{-(\beta_0+\beta_1 x+\varepsilon)}}\approx\frac{e^{(\beta_0+\beta_1 x)}}{1+e^{(\beta_0+\beta_1 x)}}\ . $$ The odds of the outcome in terms of the variable $x$ are $$\frac{P(x)}{1-P(x)}=e^{(\beta_0+\beta_1 x)}\ , \qquad \log \frac{P(x)}{1-P(x)}=\beta_0+\beta_1 x \ .$$
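
A sketch of the three logistic regression variants used in this study; the solver and the regularization strength C are assumptions:

```python
from sklearn.linear_model import LogisticRegression

# Standard (unregularized), L1 and L2 regularized logistic regression.
lr_models = {
    # penalty=None requires scikit-learn >= 1.2 (use penalty="none" in older versions).
    "standard": LogisticRegression(penalty=None, max_iter=1000),
    "L1": LogisticRegression(penalty="l1", C=1.0, solver="liblinear"),
    "L2": LogisticRegression(penalty="l2", C=1.0, max_iter=1000),
}
for name, clf in lr_models.items():
    clf.fit(X_train, y_train)
```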

The confusion matrix is a table that summarizes the performance of a classification algorithm, e.g.,

                | Predicted Positive  | Predicted Negative
Actual Positive | True Positive (TP)  | False Negative (FN)
Actual Negative | False Positive (FP) | True Negative (TN)

For the overall fraction of correct predictions, $$\mathsf{Accuracy}=\frac{\mathsf{TP}+\mathsf{TN}}{\mathsf{TP}+\mathsf{TN}+\mathsf{FP}+\mathsf{FN}}$$ For identifying all positive instances, $$\mathsf{Recall~or~Sensitivity}=\frac{\mathsf{TP}}{\mathsf{TP}+\mathsf{FN}}$$
For identifying only truly positive instances among the predicted positives, $$\mathsf{Precision}=\frac{\mathsf{TP}}{\mathsf{TP}+\mathsf{FP}}$$ For avoiding false alarms, $$\mathsf{Specificity}=\frac{\mathsf{TN}}{\mathsf{TN}+\mathsf{FP}}$$ The F1 score captures the trade-off between recall and precision, $$\mathsf{F1}=2\,\frac{\mathsf{Precision}\times\mathsf{Recall}}{\mathsf{Precision}+\mathsf{Recall}}$$

Multiple-class error metrics, e.g. for three classes:

               | Predicted Class 1 | Predicted Class 2 | Predicted Class 3
Actual Class 1 | TP1               |                   |
Actual Class 2 |                   | TP2               |
Actual Class 3 |                   |                   | TP3
$$\mathsf{Accuracy}=\frac{\mathsf{TP1}+\mathsf{TP2}+\mathsf{TP3}}{\mathsf{Total}}$$

The receiver operating characteristic (ROC) curve plots the sensitivity (recall) against the false positive rate and can be used to evaluate a classifier. The area under the ROC curve, i.e., the ROC-AUC, measures how well the two classes are separated. In the ROC plot of true positive rate (sensitivity) versus false positive rate (1 - specificity), the diagonal represents random guessing (ROC-AUC = 0.5), the lower right triangle corresponds to performance worse than guessing (ROC-AUC < 0.5), and the upper left triangle to performance better than guessing (ROC-AUC > 0.5). The closer the curve lies to the top left corner, the better the classification model. The ROC curve is generally better suited to data with balanced classes.

The precision-recall curve measures the trade-off between precision and recall. It is generally better suited to data with imbalanced classes.
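
A sketch of per-class ROC and precision-recall curves in the one-vs-rest setting, assuming a fitted model with predict_proba and the class encoding of Section 2.4:

```python
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.preprocessing import label_binarize

classes = [0, 1, 2, 3]  # cloud, fog, rain, snow
y_test_bin = label_binarize(y_test, classes=classes)
y_score = model.predict_proba(X_test)

for k, name in zip(classes, ["cloud", "fog", "rain", "snow"]):
    # Curves for class `name` against all other classes; these arrays
    # can then be plotted or summarized by the area under the curve.
    fpr, tpr, _ = roc_curve(y_test_bin[:, k], y_score[:, k])
    precision, recall, _ = precision_recall_curve(y_test_bin[:, k], y_score[:, k])
```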

3.2 K nearest neighbors

For an appropriate K nearest neighbors (KNN) model, a suitable value of K and a method for measuring the distance between neighbors are required. The elbow method is a common approach to determine the K value: in a curve of the error rate as a function of K, K is chosen at the elbow point, where the model approaches the minimum error on the hold-out set; beyond the elbow point, the rate of improvement slows or stops. Widely used distance measures include the Euclidean distance $$d_E=\sqrt{\sum\limits_i (p_i-q_i)^2}$$ and the Manhattan distance $$d_M=\sum\limits_i |p_i-q_i| \ .$$ Features on large scales contribute large distances and therefore have a heavier effect on the outcome than features on small scales. To avoid this, feature scaling is important so that all features contribute to the distance on a similar scale.

Here we select K = 3, where the error rate is lowest and the F1 score, which captures the trade-off between recall and precision, is high.
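
A sketch of the elbow search over K described above; the range of K values is an assumption:

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Error rate on the hold-out set as a function of K.
error_rate = []
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    error_rate.append(np.mean(knn.predict(X_test) != y_test))

# K = 3 is chosen at the elbow point, where the error rate is lowest.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
```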

3.3 Support vector machines

A support vector machine is a supervised learning model for classification, regression or other tasks that constructs a hyperplane or a set of hyperplanes in a high- or infinite-dimensional space.

There are no clear linear decision boundaries between the four classes in a 2-dimensional feature space, but hyperplane decision boundaries may exist in a 3- or higher-dimensional space.
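
A sketch of an SVM classifier with a grid search over the kernel and its parameters; the parameter grid itself is an assumption:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {"C": [0.1, 1, 10],
              "gamma": ["scale", 0.1, 1],
              "kernel": ["linear", "rbf"]}

# probability=True enables predict_proba for the ROC-AUC evaluation.
svm_search = GridSearchCV(SVC(probability=True), param_grid, cv=5)
svm_search.fit(X_train, y_train)
svm_best = svm_search.best_estimator_
```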

3.4 Decision trees

Algorithm: starting from the root node, evaluate the candidate splits of the features at each node, choose the best split, and recurse on the resulting child nodes until a stopping criterion (e.g. a maximum depth, a minimum number of samples, or pure nodes) is reached.

The best split can be defined as the one that maximizes the information gained by the split. The information gain of an entropy-based split from node $i$ at step $t$ into nodes $j$ at step $t+1$ is defined by $$IG(i,t+1)=H(i,t)-\sum\limits_j p(j,t+1)\,H(j,t+1) \ ,$$ where the entropy of node $i$ at step $t$ is $$H(i,t)=-\sum\limits_{c=1}^{n}p_c(i,t)\log_2\big(p_c(i,t)\big)$$ and $p_c(i,t)$ is the proportion of class $c$ in that node. In contrast to splitting based on the classification error $$E(i,t)=1-\max_c\big(p_c(i,t)\big) \ ,$$ which tends to leave the end nodes not homogeneous, splitting based on the entropy can reach the goal of homogeneous end nodes. Another frequently used splitting criterion is the Gini index, defined by $$G(i,t)=1-\sum\limits_{c=1}^{n}p_c^2(i,t) \ . $$

To find the best split at each step, a greedy search can be used.

A decision tree tends to overfit and has high variance. One solution to reduce the variance is to prune the tree based on a classification error threshold. Another solution is to combine the predictions of many different trees.
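
A sketch of a decision tree with entropy-based splitting; limiting max_depth is one simple form of pruning to reduce the variance, and the depth value shown is an assumption:

```python
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion="entropy",  # or "gini" for the Gini index
                              max_depth=5,
                              random_state=42)
tree.fit(X_train, y_train)
```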

3.5 Bagging and random forest

Bagging, short for bootstrap aggregating, is an ensemble method that helps to reduce variance and avoid overfitting. It is usually applied to decision trees. For $n$ independent trees, each with variance $\sigma^2$, the variance of the bagged estimate is $\sigma^2/n$. However, the bootstrap samples may be correlated: if the correlation coefficient is $\rho$, the bagging variance equals $\rho\sigma^2 +(1-\rho)\sigma^2/n$, which tends to $\rho\sigma^2$ as $n$ increases. To avoid this, more randomness can be introduced by further de-correlating the trees, i.e., by using a random subset of features for each tree, e.g. a subset of $\sqrt{m}$ features for classification and $m/3$ features for regression, where $m$ is the total number of features. This method is called random forest; relative to bagging, its variance is further reduced. By additionally creating the splits at random, even more randomness is introduced; this method is called extra random trees (extremely randomized trees).
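
A sketch of the bagging, random forest and extra trees classifiers; n_estimators and random_state are assumptions, and max_features="sqrt" corresponds to the $\sqrt{m}$ feature subset for classification mentioned above:

```python
from sklearn.ensemble import (BaggingClassifier, ExtraTreesClassifier,
                              RandomForestClassifier)

bagging = BaggingClassifier(n_estimators=100, random_state=42)
forest = RandomForestClassifier(n_estimators=100, max_features="sqrt", random_state=42)
extra = ExtraTreesClassifier(n_estimators=100, max_features="sqrt", random_state=42)

for clf in (bagging, forest, extra):
    clf.fit(X_train, y_train)
```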

3.6 Boosting and stacking

Boosting algorithm: weak learners are fitted sequentially, with each successive learner concentrating on the observations that the previous learners handled poorly (e.g. by re-weighting the samples or fitting the residuals); the final prediction combines all learners with appropriate weights.

A learning rate smaller than 1 acts as regularization and reduces overfitting: it shrinks the contribution of each successive learner by a factor less than one.

Different boosting methods use different loss functions. The AdaBoost (adaptive boosting) loss function is exponential, $e^{-\mathrm{margin}}$, and is more sensitive to outliers than other types of boosting. The gradient boosting loss function is more robust than the AdaBoost loss function; a common implementation of gradient boosting uses the log-likelihood loss $\log(1+e^{-\mathrm{margin}})$.

A stacked model combines base models of any kind. The output of a stacked model combines the outputs of the base models via weighting or a majority vote.
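
A sketch of the boosting and voting classifiers; the hyperparameters and the choice of base estimators for the vote are assumptions, and `knn` and `forest` refer to the classifiers sketched in the previous sections:

```python
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              VotingClassifier)

ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=42)
gbt = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Soft voting averages the predicted class probabilities of the base models.
vote = VotingClassifier(estimators=[("knn", knn), ("rf", forest), ("gbt", gbt)],
                        voting="soft")

for clf in (ada, gbt, vote):
    clf.fit(X_train, y_train)
```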

3.7 Comparison of models

The performance of the above classification models is compared in this section. The confusion matrix of the prediction results is visualized for each model. The diagonal elements of each matrix are the true positive counts of the classes: the larger a diagonal element, the more cases of the corresponding class are classified correctly. Due to the high proportion of rain, the third diagonal element is large in every confusion matrix. The models are also compared in terms of the precision, recall, F-score, accuracy and ROC-AUC score of the prediction results. Among all models, K nearest neighbors, random forest, extra random forest and the voting classifier perform better. Based on the value averaged over precision, recall, F-score, accuracy and ROC-AUC score, we further rank all models. As a result, the random forest performs best and the L1 regularized logistic regression performs worst in this study.
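
A sketch of this ranking by the average of the five scores; `models` is assumed to be a dict mapping model names to the fitted classifiers (all with predict_proba available, e.g. SVC(probability=True)), and the use of weighted averaging over classes is an assumption:

```python
import pandas as pd
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score)

rows = {}
for name, clf in models.items():
    y_pred = clf.predict(X_test)
    rows[name] = [precision_score(y_test, y_pred, average="weighted", zero_division=0),
                  recall_score(y_test, y_pred, average="weighted"),
                  f1_score(y_test, y_pred, average="weighted"),
                  accuracy_score(y_test, y_pred),
                  roc_auc_score(y_test, clf.predict_proba(X_test), multi_class="ovr")]

scores = pd.DataFrame(rows, index=["precision", "recall", "f1", "accuracy", "roc_auc"]).T
scores["mean"] = scores.mean(axis=1)
print(scores.sort_values("mean", ascending=False))  # best model first
```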

4 Summary

This study used 11 supervised learning models to classify the weather conditions at Cologne Bonn airport based on 7 features, including air temperature, atmospheric pressure, humidity, wind and so on. The 11 models are standard logistic regression, L1 regularized logistic regression, L2 regularized logistic regression, K nearest neighbors (KNN), support vector machines (SVM), decision trees, random forest, extra random forest, gradient boosted trees, adaptive boosting (AdaBoost) and a voting classifier. Each model, with its best estimator fitted on the training data set, was used to predict the weather conditions on the test data set. We evaluated the performance of the models by several indices, including the precision, recall, F-score, accuracy, ROC-AUC score and the confusion matrix. Among all models, K nearest neighbors, random forest, extra random forest and the voting classifier perform better. Ranking the models by the average of precision, recall, F-score, accuracy and ROC-AUC score, we found that the random forest performs best and the L1 regularized logistic regression performs worst in this study.

5 Suggestions for next steps

The value of random_state has some effect on the initialization of each model. In particular, the effect is noticeable for the tree-based methods. This issue should be examined and addressed in the next steps.

In the current study, the classification was based on features observed at the same time as the target, i.e., the weather condition. A further step is to use the classification models to predict the weather condition from features observed at a time preceding that of the corresponding target, and to evaluate and compare the models' performance in this setting.