[LOGR1] - Logistic regression
Simple example of logistic regression with a sklearn solution.
Objectives:
- Logistic regression aims to provide the probability of belonging to a class.
- Discover a 100% TensorFlow implementation... and learn to love Keras
What we're going to do:
X contains the features of each observation
y contains the class label (1 or 0)
We'll look for a value of $\theta$ such that the linear combination $\theta^{T}X$ can be used to compute our probability:
$\hat{p} = h_\theta(X) = \sigma(\theta^T{X})$
Where $\sigma$ is the logistic function, an S-shaped (sigmoid) curve:
$ \sigma(t) = \dfrac{1}{1 + \exp(-t)} $
The predicted value $\hat{y}$ will then be calculated as follows:
$ \hat{y} = \begin{cases} 0 & \text{if } \hat{p} < 0.5 \\ 1 & \text{if } \hat{p} \geq 0.5 \end{cases} $
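As a quick illustration, here is a minimal NumPy sketch of this decision rule (the helpers sigmoid and predict_class are our own, not part of this notebook):
import numpy as np

def sigmoid(t):
    '''Logistic function: sigma(t) = 1 / (1 + exp(-t))'''
    return 1.0 / (1.0 + np.exp(-t))

def predict_class(theta, X):
    '''y_hat = 1 where p_hat >= 0.5, else 0'''
    p_hat = sigmoid(X @ theta)            # one probability per observation
    return (p_hat >= 0.5).astype(int)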
Calculation of the cost of the regression:
For a training observation x, the cost can be calculated as follows:
$ c(\theta) = \begin{cases} -\log(\hat{p}) & \text{if } y = 1 \\ -\log(1 - \hat{p}) & \text{if } y = 0 \end{cases} $
The regression cost function (log loss) over the whole training set can be written as follows:
$ J(\theta) = -\dfrac{1}{m} \sum_{i=1}^{m}{\left[ y^{(i)} \log\left(\hat{p}^{(i)}\right) + (1 - y^{(i)}) \log\left(1 - \hat{p}^{(i)}\right)\right]} $
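As a sketch, $J(\theta)$ can be computed directly from the predicted probabilities (log_loss_theta is a hypothetical helper of ours, reusing numpy as np from the sketch above):
def log_loss_theta(theta, X, y, eps=1e-12):
    '''Mean log loss J(theta) over the m training observations'''
    p_hat = 1.0 / (1.0 + np.exp(-(X @ theta)))
    p_hat = np.clip(p_hat, eps, 1.0 - eps)        # clip to avoid log(0)
    return -np.mean(y * np.log(p_hat) + (1.0 - y) * np.log(1.0 - p_hat))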
Step 1 - Import and init
You can also adjust the verbosity by changing the value of TF_CPP_MIN_LOG_LEVEL:
- 0 = all messages are logged (default)
- 1 = INFO messages are not printed
- 2 = INFO and WARNING messages are not printed
- 3 = INFO, WARNING and ERROR messages are not printed
# import os
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import matplotlib
import matplotlib.pyplot as plt
# import math
import random
# import os
import sys
import fidle
# Init Fidle environment
run_id, run_dir, datasets_dir = fidle.init('LOGR1')
FIDLE - Environment initialization
Version              : 2.3.0
Run id               : LOGR1
Run dir              : ./run/LOGR1
Datasets dir         : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle
Start time           : 03/03/24 21:03:14
Hostname             : r3i6n3 (Linux)
Tensorflow log level : Info + Warning + Error (=0)
Update keras cache   : False
Update torch cache   : False
Save figs            : ./run/LOGR1/figs (True)
numpy                : 1.24.4
sklearn              : 1.3.2
yaml                 : 6.0.1
matplotlib           : 3.8.2
pandas               : 2.1.3
1.1 - Useful stuff (hidden)
def vector_infos(name, V):
    '''Displaying some information about a vector'''
    with np.printoptions(precision=4, suppress=True):
        print("{:16} : ndim={} shape={:10} Mean = {} Std = {}".format(name, V.ndim, str(V.shape), V.mean(axis=0), V.std(axis=0)))
def do_i_have_it(hours_of_work, hours_of_sleep):
    '''Returns the exam result based on work and sleep hours'''
    hours_of_sleep_min = 5
    hours_of_work_min  = 4
    hours_of_game_max  = 3
    # ---- Have to sleep and work
    if hours_of_sleep < hours_of_sleep_min: return 0
    if hours_of_work  < hours_of_work_min:  return 0
    # ---- Gameboy is not good for you
    hours_of_game = 24 - 10 - hours_of_sleep - hours_of_work + random.gauss(0, 0.4)
    if hours_of_game > hours_of_game_max: return 0
    # ---- Fine, you got it
    return 1
def make_students_dataset(size, noise):
    '''Builds a dataset for <size> students'''
    x = []
    y = []
    for i in range(size):
        w = random.gauss(5, 1)      # hours of work
        s = random.gauss(7, 1.5)    # hours of sleep
        r = do_i_have_it(w, s)
        x.append([w, s])
        y.append(r)
    return (np.array(x), np.array(y))
def plot_data(x, y, colors=('green','red'), legend=True):
    '''Plots a dataset'''
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(10, 8)
    ax.plot(x[y==1, 0], x[y==1, 1], 'o', color=colors[0], markersize=4, label="y=1 (positive)")
    ax.plot(x[y==0, 0], x[y==0, 1], 'o', color=colors[1], markersize=4, label="y=0 (negative)")
    if legend: ax.legend()
    plt.tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)
    plt.xlabel('Hours of work')
    plt.ylabel('Hours of sleep')
    plt.show()
def plot_results(x_test, y_test, y_pred):
    '''Plots a result'''
    precision = metrics.precision_score(y_test, y_pred)
    recall    = metrics.recall_score(y_test, y_pred)
    print("Precision = {:5.3f}   Recall = {:5.3f}".format(precision, recall))
    x_pred_positives = x_test[ y_pred == 1 ]     # items predicted positive
    x_real_positives = x_test[ y_test == 1 ]     # items actually positive
    x_pred_negatives = x_test[ y_pred == 0 ]     # items predicted negative
    x_real_negatives = x_test[ y_test == 0 ]     # items actually negative
    fig, axs = plt.subplots(2, 2)
    fig.subplots_adjust(wspace=.1, hspace=0.2)
    fig.set_size_inches(14, 10)
    axs[0,0].plot(x_pred_positives[:,0], x_pred_positives[:,1], 'o', color='lightgreen',  markersize=10, label="Predicted positives")
    axs[0,0].plot(x_real_positives[:,0], x_real_positives[:,1], 'o', color='green',       markersize=4,  label="Actual positives")
    axs[0,0].legend()
    axs[0,0].tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)
    axs[0,0].set_xlabel('$x_1$')
    axs[0,0].set_ylabel('$x_2$')
    axs[0,1].plot(x_pred_negatives[:,0], x_pred_negatives[:,1], 'o', color='lightsalmon', markersize=10, label="Predicted negatives")
    axs[0,1].plot(x_real_negatives[:,0], x_real_negatives[:,1], 'o', color='red',         markersize=4,  label="Actual negatives")
    axs[0,1].legend()
    axs[0,1].tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)
    axs[0,1].set_xlabel('$x_1$')
    axs[0,1].set_ylabel('$x_2$')
    axs[1,0].plot(x_pred_positives[:,0], x_pred_positives[:,1], 'o', color='lightgreen',  markersize=10, label="Predicted positives")
    axs[1,0].plot(x_pred_negatives[:,0], x_pred_negatives[:,1], 'o', color='lightsalmon', markersize=10, label="Predicted negatives")
    axs[1,0].plot(x_real_positives[:,0], x_real_positives[:,1], 'o', color='green',       markersize=4,  label="Actual positives")
    axs[1,0].plot(x_real_negatives[:,0], x_real_negatives[:,1], 'o', color='red',         markersize=4,  label="Actual negatives")
    axs[1,0].tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)
    axs[1,0].set_xlabel('$x_1$')
    axs[1,0].set_ylabel('$x_2$')
    axs[1,1].pie([precision, 1-precision], explode=[0, 0.1], labels=["", "Errors"],
                 autopct='%1.1f%%', shadow=False, startangle=70, colors=["lightsteelblue", "coral"])
    axs[1,1].axis('equal')
    plt.show()
1.2 - Parameters
data_size = 1000 # Number of observations
data_cols = 2 # observation size
data_noise = 0.2
random_seed = 123
Step 2 - Prepare the data
2.1 - Get a dataset
x_data, y_data = make_students_dataset(data_size, data_noise)
2.2 - Show it
plot_data(x_data, y_data)
vector_infos('Dataset X',x_data)
vector_infos('Dataset y',y_data)
Dataset X        : ndim=2 shape=(1000, 2)  Mean = [5.0027 6.9802] Std = [0.9491 1.4651]
Dataset y        : ndim=1 shape=(1000,)    Mean = 0.656 Std = 0.47504105085771264
2.3 - Preparation of data
We're going to:
- split the data into:
  - a training set
  - a test set
- normalize the data
# ---- Split data
n = int(data_size * 0.8)
x_train = x_data[:n]
y_train = y_data[:n]
x_test = x_data[n:]
y_test = y_data[n:]
# ---- Normalization
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)
x_train = (x_train-mean)/std
x_test = (x_test-mean)/std
# ---- About it
vector_infos('X_train',x_train)
vector_infos('y_train',y_train)
vector_infos('X_test',x_test)
vector_infos('y_test',y_test)
y_train_h = y_train.reshape(-1,)   # needed for the visualization
X_train          : ndim=2 shape=(800, 2)   Mean = [-0. -0.] Std = [1. 1.]
y_train          : ndim=1 shape=(800,)     Mean = 0.65375 Std = 0.4757740403805151
X_test           : ndim=2 shape=(200, 2)   Mean = [0.028 0.077] Std = [0.9318 0.99 ]
y_test           : ndim=1 shape=(200,)     Mean = 0.665 Std = 0.4719904660054057
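For reference, scikit-learn offers equivalent utilities; here is a sketch (not used in this notebook; the *2 variable names are ours), assuming the same x_data and y_data:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Same 80/20 split, without shuffling, as above
x_train2, x_test2, y_train2, y_test2 = train_test_split(x_data, y_data, test_size=0.2, shuffle=False)

# Mean and std are learned on the training set only, then applied to both sets
scaler   = StandardScaler().fit(x_train2)
x_train2 = scaler.transform(x_train2)
x_test2  = scaler.transform(x_test2)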
2.4 - Have a look
fidle.utils.display_md('**This is what we know :**')
plot_data(x_train, y_train)
fidle.utils.display_md('**This is what we want to classify :**')
plot_data(x_test, y_test, colors=("gray","gray"), legend=False)
This is what we know :
This is what we want to classify :
Step 3 - Logistic regression
# ---- Create an instance
# Use SAGA solver (Stochastic Average Gradient descent solver)
#
logreg = LogisticRegression(C=1e5, verbose=0, solver='saga')
# ---- Fit the data.
#
logreg.fit(x_train, y_train)
# ---- Do a prediction
#
y_pred = logreg.predict(x_test)
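These predicted classes are simply the probabilities $\hat{p}$ thresholded at 0.5, as described above; a quick sanity check (our own snippet, not part of the original notebook):
p_hat = logreg.predict_proba(x_test)[:, 1]        # column 1 = estimated P(y=1)
print("Agreement with the 0.5 threshold:", np.mean((p_hat >= 0.5).astype(int) == y_pred))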
3.3 - Evaluation
Precision = ability to avoid false positives = $\dfrac{T_p}{T_p+F_p}$
Recall = ability to find all the actual positives = $\dfrac{T_p}{T_p+F_n}$
With:
- $T_p$ (true positive) : correct positive answer
- $F_p$ (false positive) : incorrect positive answer
- $T_n$ (true negative) : correct negative answer
- $F_n$ (false negative) : incorrect negative answer
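To connect these definitions with the scores printed below, the counts can also be computed by hand (a sketch; the variable names are ours):
tp = np.sum((y_pred == 1) & (y_test == 1))        # true positives
fp = np.sum((y_pred == 1) & (y_test == 0))        # false positives
fn = np.sum((y_pred == 0) & (y_test == 1))        # false negatives
print("Precision =", tp / (tp + fp), "  Recall =", tp / (tp + fn))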
plot_results(x_test,y_test, y_pred)
Precision = 0.881   Recall = 0.887
Step 4 - Bending the space to a model #2 ;-)
We're going to augment our observations with new features: ${x_1}^2$, ${x_2}^2$, ${x_1}^3$ and ${x_2}^3$
$ X=\begin{bmatrix}1 & x_{11} & x_{12}\\ \vdots & & \vdots\\ 1 & x_{m1} & x_{m2}\end{bmatrix} \text{ and } X_{ng}=\begin{bmatrix}1 & x_{11} & x_{12} & x_{11}^2 & x_{12}^2 & x_{11}^3 & x_{12}^3\\ \vdots & & & & & & \vdots\\ 1 & x_{m1} & x_{m2} & x_{m1}^2 & x_{m2}^2 & x_{m1}^3 & x_{m2}^3\end{bmatrix} $
Note: sklearn.preprocessing.PolynomialFeatures can do that for us, but here we'll do it ourselves:
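For the record, a sketch of the PolynomialFeatures route (not used below; note that it also generates the cross terms $x_1 x_2$, $x_1^2 x_2$, ..., which our manual version skips):
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly.fit_transform(x_train)        # fit on the training set...
x_test_poly  = poly.transform(x_test)             # ...then transform the test set the same way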
4.1 - Extend data
x_train_enhanced = np.c_[x_train,
                         x_train[:, 0] ** 2,
                         x_train[:, 1] ** 2,
                         x_train[:, 0] ** 3,
                         x_train[:, 1] ** 3]
x_test_enhanced  = np.c_[x_test,
                         x_test[:, 0] ** 2,
                         x_test[:, 1] ** 2,
                         x_test[:, 0] ** 3,
                         x_test[:, 1] ** 3]
4.2 - Run the classifier
# ---- Create an instance
# Use SAGA solver (Stochastic Average Gradient descent solver)
#
logreg = LogisticRegression(C=1e5, verbose=0, solver='saga', max_iter=5000, n_jobs=-1)
# ---- Fit the data.
#
logreg.fit(x_train_enhanced, y_train)
# ---- Do a prediction
#
y_pred = logreg.predict(x_test_enhanced)
4.3 - Evaluation
plot_results(x_test_enhanced, y_test, y_pred)
Precision = 0.926   Recall = 0.940
fidle.end()
End time : 03/03/24 21:03:16
Duration : 00:00:02 970ms
This notebook ends here :-)
https://fidle.cnrs.fr