[K3GTSRB1] - Dataset analysis and preparation
Episode 1: Analysis of the GTSRB dataset and creation of an enhanced dataset
Objectives:
- Understand the complexity associated with data, even when the data are just images
- Learn how to build up a simple and usable image dataset
The German Traffic Sign Recognition Benchmark (GTSRB) is a dataset of more than 50,000 photos of road signs, divided into 43 classes.
The final aim is to recognise them!
A description is available here: http://benchmark.ini.rub.de/?section=gtsrb&subsection=dataset
What we're going to do:
- Understanding the dataset
- Preparing and formatting enhanced data
- Saving the enhanced datasets in h5 file format
Step 1 - Import and init
import os, time, sys
import csv
import math, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py
from skimage.morphology import disk
from skimage.util import img_as_ubyte
from skimage.filters import rank
from skimage import io, color, exposure, transform
from importlib import reload
import fidle
# Init Fidle environment
run_id, run_dir, datasets_dir = fidle.init('K3GTSRB1')
FIDLE - Environment initialization
Version : 2.3.0 Run id : K3GTSRB1 Run dir : ./run/K3GTSRB1 Datasets dir : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle Start time : 03/03/24 21:04:20 Hostname : r3i7n8 (Linux) Tensorflow log level : Info + Warning + Error (=0) Update keras cache : False Update torch cache : False Save figs : ./run/K3GTSRB1/figs (True) numpy : 1.24.4 sklearn : 1.3.2 yaml : 6.0.1 skimage : 0.22.0 matplotlib : 3.8.2 pandas : 2.1.3
Step 2 - Parameters
Generating the datasets may require some time and space: about 10 minutes and 10 GB for the full dataset.
You can choose to run quick tests or to generate the whole enhanced dataset by setting the following parameters:
- scale : 1 means 100% of the dataset; set 0.2 for tests (about 2 minutes with scale = 0.2)
- progress_verbosity : verbosity of the progress bar: 0 = silent, 1 = progress bar, 2 = one line
- output_dir : where to write the enhanced dataset; use ./data for test purposes, or <datasets_dir>/GTSRB/enhanced to add the enhanced sets to your datasets directory
Uncomment the right lines according to what you want:
# ---- For smart tests :
#
scale = 0.2
output_dir = './data'
# ---- For a Full dataset generation :
#
# scale = 1
# output_dir = f'{datasets_dir}/GTSRB/enhanced'
# ---- Verbosity
#
progress_verbosity = 2
Override parameters (batch mode) - Just forget this cell
fidle.override('scale', 'output_dir', 'progress_verbosity')
** Overrided parameters : ** scale : 1 output_dir : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced
Step 3 - Read the dataset
A description is available here: http://benchmark.ini.rub.de/?section=gtsrb&subsection=dataset
- Each directory contains one CSV annotation file, GT-<ClassID>.csv, together with the training images
- The first line of the CSV file gives the field names (the files are ;-separated, see the sketch below): Filename;Width;Height;Roi.X1;Roi.Y1;Roi.X2;Roi.Y2;ClassId
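Just to illustrate, a minimal sketch for reading one of these per-class annotation files with pandas. The exact path below is hypothetical (adapt it to your tree), and the ';' separator is an assumption based on the field list above:
# Hypothetical path to a per-class annotation file - adapt it to your installation
csv_path = f'{datasets_dir}/GTSRB/origine/Train/00000/GT-00000.csv'
df_class = pd.read_csv(csv_path, sep=';')   # assumed ';'-separated, as the field list suggests
print(df_class.head())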
3.1 - Understanding the dataset
The original dataset is in: <dataset_dir>/GTSRB/origine.
There are 3 subsets: Train, Test and Meta.
Each subset has a CSV file and a subdirectory with images.
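A quick, optional check of this layout (we simply expect to find the three subdirectories and their CSV files):
# List the top level of the original dataset directory
print(sorted(os.listdir(f'{datasets_dir}/GTSRB/origine')))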
df = pd.read_csv(f'{datasets_dir}/GTSRB/origine/Test.csv', header=0)
display(df.head(10))
| | Width | Height | Roi.X1 | Roi.Y1 | Roi.X2 | Roi.Y2 | ClassId | Path |
|---|---|---|---|---|---|---|---|---|
| 0 | 53 | 54 | 6 | 5 | 48 | 49 | 16 | Test/00000.png |
| 1 | 42 | 45 | 5 | 5 | 36 | 40 | 1 | Test/00001.png |
| 2 | 48 | 52 | 6 | 6 | 43 | 47 | 38 | Test/00002.png |
| 3 | 27 | 29 | 5 | 5 | 22 | 24 | 33 | Test/00003.png |
| 4 | 60 | 57 | 5 | 5 | 55 | 52 | 11 | Test/00004.png |
| 5 | 52 | 56 | 5 | 5 | 47 | 51 | 38 | Test/00005.png |
| 6 | 147 | 130 | 12 | 12 | 135 | 119 | 18 | Test/00006.png |
| 7 | 32 | 33 | 5 | 5 | 26 | 28 | 12 | Test/00007.png |
| 8 | 45 | 50 | 6 | 5 | 40 | 45 | 25 | Test/00008.png |
| 9 | 81 | 86 | 7 | 7 | 74 | 79 | 35 | Test/00009.png |
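The Roi.* columns give a bounding box around the sign inside each photo. We won't use them in this notebook, but as a minimal sketch of how they could be exploited (reusing the df and datasets_dir defined above):
# Crop the first test image to its region of interest (illustrative only)
row = df.iloc[0]
img = io.imread(f"{datasets_dir}/GTSRB/origine/{row['Path']}")
roi = img[row['Roi.Y1']:row['Roi.Y2'], row['Roi.X1']:row['Roi.X2']]
print(img.shape, '->', roi.shape)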
3.2 - Useful functions
A nice function for reading a dataset from an index CSV file.
Input: an index.csv file
Output: an array of images and an array of corresponding labels
def read_csv_dataset(csv_file):
    '''
    Reads traffic sign data from the German Traffic Sign Recognition Benchmark dataset.
    Arguments:
        csv_file : description file, for example /data/GTSRB/Train.csv
    Returns:
        x,y : numpy array of images, numpy array of corresponding labels
    '''
    path = os.path.dirname(csv_file)
    name = os.path.basename(csv_file)

    # ---- Read csv file
    #
    df = pd.read_csv(csv_file, header=0)

    # ---- Get filenames and ClassIds
    #
    filenames = df['Path'].to_list()
    y         = df['ClassId'].to_list()
    x         = []

    # ---- Read images
    #
    for filename in filenames:
        image = io.imread(f'{path}/{filename}')
        x.append(image)
        fidle.utils.update_progress(name, len(x), len(filenames), verbosity=progress_verbosity)

    # ---- Return
    #      Note: images have different sizes, hence dtype=object
    return np.array(x, dtype=object), np.array(y)
3.3 - Read the data
We will read the following datasets:
- Train subset, used as learning data: x_train, y_train
- Test subset, used as validation data: x_test, y_test
- Meta subset, used for visualisation: x_meta, y_meta
The learning data will be randomly shuffled and the illustration data (Meta) sorted.
This will take about 1'30 on an HPC node, or about 45s on a laptop.
chrono=fidle.Chrono()
chrono.start()
# ---- Read datasets
(x_train,y_train) = read_csv_dataset(f'{datasets_dir}/GTSRB/origine/Train.csv')
(x_test ,y_test) = read_csv_dataset(f'{datasets_dir}/GTSRB/origine/Test.csv')
(x_meta ,y_meta) = read_csv_dataset(f'{datasets_dir}/GTSRB/origine/Meta.csv')
# ---- Shuffle train set
x_train, y_train = fidle.utils.shuffle_np_dataset(x_train, y_train)
# ---- Sort Meta
combined = list(zip(x_meta,y_meta))
combined.sort(key=lambda x: x[1])
x_meta,y_meta = zip(*combined)
chrono.show()
Train.csv [########################################] 100.0% of 39209 Test.csv [########################################] 100.0% of 12630 Meta.csv [########################################] 100.0% of 43 Datasets have been shuffled. Duration : 231.21 seconds
Step 4 - Statistics
4.1 - Do statistics
train_size  = []
train_ratio = []
train_lx    = []
train_ly    = []
test_size   = []
test_ratio  = []
test_lx     = []
test_ly     = []

# ---- Collect size, aspect ratio and dimensions of every image
#
for image in x_train:
    (lx,ly,lz) = image.shape
    train_size.append(lx*ly/1024)     # size in Kpixels
    train_ratio.append(lx/ly)
    train_lx.append(lx)
    train_ly.append(ly)

for image in x_test:
    (lx,ly,lz) = image.shape
    test_size.append(lx*ly/1024)      # size in Kpixels
    test_ratio.append(lx/ly)
    test_lx.append(lx)
    test_ly.append(ly)
4.2 - Show statistics
figsize=(10,4)
# ------ Global stuff
print("x_train shape : ",x_train.shape)
print("y_train shape : ",y_train.shape)
print("x_test shape : ",x_test.shape)
print("y_test shape : ",y_test.shape)
# ------ Statistics / sizes
plt.figure(figsize=figsize)
plt.hist([train_size,test_size], bins=100)
plt.gca().set(title='Sizes in Kpixels - Train=[{:5.2f}, {:5.2f}]'.format(min(train_size),max(train_size)),
ylabel='Population', xlim=[0,30])
plt.legend(['Train','Test'])
fidle.scrawler.save_fig('01-stats-sizes')
plt.show()
# ------ Statistics / ratio lx/ly
plt.figure(figsize=figsize)
plt.hist([train_ratio,test_ratio], bins=100)
plt.gca().set(title='Ratio lx/ly - Train=[{:5.2f}, {:5.2f}]'.format(min(train_ratio),max(train_ratio)),
ylabel='Population', xlim=[0.8,1.2])
plt.legend(['Train','Test'])
fidle.scrawler.save_fig('02-stats-ratios')
plt.show()
# ------ Statistics / lx
plt.figure(figsize=figsize)
plt.hist([train_lx,test_lx], bins=100)
plt.gca().set(title='Images lx - Train=[{:5.2f}, {:5.2f}]'.format(min(train_lx),max(train_lx)),
ylabel='Population', xlim=[20,150])
plt.legend(['Train','Test'])
fidle.scrawler.save_fig('03-stats-lx')
plt.show()
# ------ Statistics / ly
plt.figure(figsize=figsize)
plt.hist([train_ly,test_ly], bins=100)
plt.gca().set(title='Images ly - Train=[{:5.2f}, {:5.2f}]'.format(min(train_ly),max(train_ly)),
ylabel='Population', xlim=[20,150])
plt.legend(['Train','Test'])
fidle.scrawler.save_fig('04-stats-ly')
plt.show()
# ------ Statistics / classId
plt.figure(figsize=figsize)
plt.hist([y_train,y_test], bins=43)
plt.gca().set(title='ClassesId', ylabel='Population', xlim=[0,43])
plt.legend(['Train','Test'])
fidle.scrawler.save_fig('05-stats-classes')
plt.show()
x_train shape : (39209,) y_train shape : (39209,) x_test shape : (12630,) y_test shape : (12630,)
Step 5 - List of classes
What are the 43 classes of our images...
fidle.scrawler.images( x_meta,y_meta, range(43), columns=8, x_size=1.4, y_size=1.4,
colorbar=False, y_pred=None, cm='binary', save_as='06-meta-signs')
Step 6 - What does it really look like?
# ---- Get and show few images
samples = [ random.randint(0,len(x_train)-1) for i in range(32)]
fidle.scrawler.images( x_train,y_train, samples, columns=8, x_size=1.5, y_size=1.5,
colorbar=False, y_pred=None, cm='binary', save_as='07-real-signs')
Step 7 - Dataset cooking...
Images must:
- have the same size, to match the input size of the network,
- be normalized.
It is possible to work on RGB or monochrome images, and to equalize the histograms.
See: Exposure with scikit-image
See: Local histogram equalization
See: Histogram equalization
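Before cooking the whole dataset, here is a minimal sketch of what global histogram equalization does to a single image. It assumes x_train is loaded (Step 3), and only uses the scikit-image calls referenced above:
# Minimal sketch: global histogram equalization of one (grayscale) training image
img    = color.rgb2gray(x_train[0])     # RGB -> grayscale, float in [0,1]
img_eq = exposure.equalize_hist(img)    # spread the intensity histogram
print(f'before: [{img.min():.2f}, {img.max():.2f}]   after: [{img_eq.min():.2f}, {img_eq.max():.2f}]')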
7.1 - Enhancement cooking
A nice function for preparing our data.
Input: a set of images (numpy array)
Output: enhanced images, resized and reprocessed (numpy array)
def images_enhancement(images, width=25, height=25, proc='RGB'):
    '''
    Resize and convert images - doesn't change the originals.
    Input images must be RGBA or RGB.
    Note: all outputs are fixed-size numpy arrays of float32.
    args:
        images       : list of images
        width,height : new image size (default 25,25)
        proc         : RGB | RGB-HE | L | L-HE | L-LHE | L-CLAHE
    return:
        numpy array of enhanced images
    '''
    lz = { 'RGB':3, 'RGB-HE':3, 'L':1, 'L-HE':1, 'L-LHE':1, 'L-CLAHE':1 }[proc]

    out = []
    for img in images:

        # ---- If RGBA, convert to RGB
        if img.shape[2] == 4:
            img = color.rgba2rgb(img)

        # ---- Resize
        img = transform.resize(img, (width,height))

        # ---- RGB / Histogram Equalization
        if proc == 'RGB-HE':
            hsv = color.rgb2hsv(img.reshape(width,height,3))
            hsv[:, :, 2] = exposure.equalize_hist(hsv[:, :, 2])
            img = color.hsv2rgb(hsv)

        # ---- Grayscale
        if proc == 'L':
            img = color.rgb2gray(img)

        # ---- Grayscale / Histogram Equalization
        if proc == 'L-HE':
            img = color.rgb2gray(img)
            img = exposure.equalize_hist(img)

        # ---- Grayscale / Local Histogram Equalization
        if proc == 'L-LHE':
            img = color.rgb2gray(img)
            img = img_as_ubyte(img)
            img = rank.equalize(img, disk(10))/255.

        # ---- Grayscale / Contrast Limited Adaptive Histogram Equalization (CLAHE)
        if proc == 'L-CLAHE':
            img = color.rgb2gray(img)
            img = exposure.equalize_adapthist(img)

        # ---- Add image to the output list
        out.append(img)
        fidle.utils.update_progress('Enhancement: ', len(out), len(images))

    # ---- Reshape images
    #      (-1, width,height,1) for L* modes
    #      (-1, width,height,3) for RGB* modes
    #
    out = np.array(out, dtype='float32')
    out = out.reshape(-1, width, height, lz)
    return out
7.2 - To get an idea of the different recipes
i = random.randint(0, len(x_train)-16)
x_samples = x_train[i:i+16]
y_samples = y_train[i:i+16]

datasets = {}
datasets['RGB']     = images_enhancement( x_samples, width=25, height=25, proc='RGB'     )
datasets['RGB-HE']  = images_enhancement( x_samples, width=25, height=25, proc='RGB-HE'  )
datasets['L']       = images_enhancement( x_samples, width=25, height=25, proc='L'       )
datasets['L-HE']    = images_enhancement( x_samples, width=25, height=25, proc='L-HE'    )
datasets['L-LHE']   = images_enhancement( x_samples, width=25, height=25, proc='L-LHE'   )
datasets['L-CLAHE'] = images_enhancement( x_samples, width=25, height=25, proc='L-CLAHE' )

fidle.utils.subtitle('EXPECTED')
x_expected = [ x_meta[j] for j in y_samples ]
fidle.scrawler.images(x_expected, y_samples, range(12), columns=12, x_size=1, y_size=1,
                      colorbar=False, y_pred=None, cm='binary', save_as='08-expected')

fidle.utils.subtitle('ORIGINAL')
fidle.scrawler.images(x_samples, y_samples, range(12), columns=12, x_size=1, y_size=1,
                      colorbar=False, y_pred=None, cm='binary', save_as='09-original')

fidle.utils.subtitle('ENHANCED')
n = 10
for k,d in datasets.items():
    print("dataset : {}  min,max=[{:.3f},{:.3f}]  shape={}".format(k, d.min(), d.max(), d.shape))
    fidle.scrawler.images(d, y_samples, range(12), columns=12, x_size=1, y_size=1,
                          colorbar=False, y_pred=None, cm='binary', save_as=f'{n}-enhanced-{k}')
    n += 1
Enhancement: [################] 100.0% of 16 Enhancement: [################] 100.0% of 16 Enhancement: [################] 100.0% of 16 Enhancement: [################] 100.0% of 16 Enhancement: [################] 100.0% of 16 Enhancement: [################] 100.0% of 16
EXPECTED
ORIGINAL
ENHANCED
dataset : RGB min,max=[0.040,1.000] shape=(16, 25, 25, 3)
dataset : RGB-HE min,max=[0.001,1.000] shape=(16, 25, 25, 3)
dataset : L min,max=[0.041,1.000] shape=(16, 25, 25, 1)
dataset : L-HE min,max=[0.002,1.000] shape=(16, 25, 25, 1)
dataset : L-LHE min,max=[0.000,1.000] shape=(16, 25, 25, 1)
dataset : L-CLAHE min,max=[0.000,1.000] shape=(16, 25, 25, 1)
7.3 - Cook and save
A function to save a dataset (h5 file)
def save_h5_dataset(x_train, y_train, x_test, y_test, x_meta, y_meta, filename):

    # ---- Create h5 file
    with h5py.File(filename, "w") as f:
        f.create_dataset("x_train", data=x_train)
        f.create_dataset("y_train", data=y_train)
        f.create_dataset("x_test",  data=x_test)
        f.create_dataset("y_test",  data=y_test)
        f.create_dataset("x_meta",  data=x_meta)
        f.create_dataset("y_meta",  data=y_meta)

    # ---- Done
    size = os.path.getsize(filename)/(1024*1024)
    print('Dataset : {:24s}  shape : {:22s} size : {:6.1f} Mo   (saved)'.format(filename, str(x_train.shape), size))
Generate the enhanced datasets:
# ---- Sizes and processings
#
all_size = [24, 48]
all_proc = ['RGB', 'RGB-HE', 'L', 'L-LHE']

# ---- Do it
#
chrono.start()

n_train = int( len(x_train)*scale )
n_test  = int( len(x_test)*scale )

fidle.utils.subtitle('Parameters :')
print(f'Scale is : {scale}')
print(f'x_train length is : {n_train}')
print(f'x_test  length is : {n_test}')
print(f'output dir is     : {output_dir}\n')

fidle.utils.subtitle('Running...')

fidle.utils.mkdir(output_dir)

for s in all_size:
    for m in all_proc:
        # ---- A nice dataset name
        filename = f'{output_dir}/set-{s}x{s}-{m}.h5'
        fidle.utils.subtitle(f'Dataset : {filename}')

        # ---- Enhancement
        #      Note : x_train is a numpy array of Python objects (images with different sizes),
        #      but images_enhancement() returns a true float32 numpy array (images of identical size),
        #      so we can save it in nice h5 files.
        #
        x_train_new = images_enhancement( x_train[:n_train], width=s, height=s, proc=m )
        x_test_new  = images_enhancement( x_test[:n_test],   width=s, height=s, proc=m )
        x_meta_new  = images_enhancement( x_meta,            width=s, height=s, proc='RGB' )

        # ---- Save
        save_h5_dataset( x_train_new, y_train[:n_train], x_test_new, y_test[:n_test], x_meta_new, y_meta, filename)

        # ---- Free memory
        x_train_new, x_test_new = 0, 0

print('\nDone.')
chrono.show()
Parameters :
Scale is : 1 x_train length is : 39209 x_test length is : 12630 output dir is : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced
Running...
Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-24x24-RGB.h5
Enhancement: [########################################] 100.0% of 39209 Enhancement: [########################################] 100.0% of 12630 Enhancement: [########################################] 100.0% of 43 Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-24x24-RGB.h5 shape : (39209, 24, 24, 3) size : 342.4 Mo (saved)
Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-24x24-RGB-HE.h5
Enhancement: [########################################] 100.0% of 39209 Enhancement: [########################################] 100.0% of 12630 Enhancement: [########################################] 100.0% of 43 Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-24x24-RGB-HE.h5 shape : (39209, 24, 24, 3) size : 342.4 Mo (saved)
Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-24x24-L.h5
Enhancement: [########################################] 100.0% of 39209 Enhancement: [########################################] 100.0% of 12630 Enhancement: [########################################] 100.0% of 43 Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-24x24-L.h5 shape : (39209, 24, 24, 1) size : 114.6 Mo (saved)
Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-24x24-L-LHE.h5
Enhancement: [########################################] 100.0% of 39209 Enhancement: [########################################] 100.0% of 12630 Enhancement: [########################################] 100.0% of 43 Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-24x24-L-LHE.h5 shape : (39209, 24, 24, 1) size : 114.6 Mo (saved)
Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-48x48-RGB.h5
Enhancement: [########################################] 100.0% of 39209 Enhancement: [########################################] 100.0% of 12630 Enhancement: [########################################] 100.0% of 43 Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-48x48-RGB.h5 shape : (39209, 48, 48, 3) size : 1368.4 Mo (saved)
Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-48x48-RGB-HE.h5
Enhancement: [########################################] 100.0% of 39209 Enhancement: [########################################] 100.0% of 12630 Enhancement: [########################################] 100.0% of 43 Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-48x48-RGB-HE.h5 shape : (39209, 48, 48, 3) size : 1368.4 Mo (saved)
Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-48x48-L.h5
Enhancement: [########################################] 100.0% of 39209 Enhancement: [########################################] 100.0% of 12630 Enhancement: [########################################] 100.0% of 43 Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-48x48-L.h5 shape : (39209, 48, 48, 1) size : 457.1 Mo (saved)
Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-48x48-L-LHE.h5
Enhancement: [########################################] 100.0% of 39209 Enhancement: [########################################] 100.0% of 12630 Enhancement: [########################################] 100.0% of 43 Dataset : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle/GTSRB/enhanced/set-48x48-L-LHE.h5 shape : (39209, 48, 48, 1) size : 457.1 Mo (saved) Done. Duration : 501.63 seconds
Step 8 - Reload data to be sure ;-)
chrono.start()

dataset = 'set-24x24-L'
samples = range(24)

with h5py.File(f'{output_dir}/{dataset}.h5','r') as f:
    x_tmp = f['x_train'][:]
    y_tmp = f['y_train'][:]
    print("dataset loaded from h5 file.")

fidle.scrawler.images(x_tmp, y_tmp, samples, columns=8, x_size=1.5, y_size=1.5,
                      colorbar=False, y_pred=None, cm='binary', save_as='16-enhanced_images')
x_tmp, y_tmp = 0, 0

chrono.show()
dataset loaded from h5 file.
Duration : 4.48 seconds
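To go a little further, here is a quick sketch listing everything stored in one of the h5 files we just generated (set-24x24-L.h5 is the one reloaded above; the six keys come from save_h5_dataset):
# List the datasets stored in an enhanced h5 file, with their shapes
with h5py.File(f'{output_dir}/set-24x24-L.h5','r') as f:
    for k in f.keys():
        print(f'{k:10s} {f[k].shape}')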
fidle.end()
End time : 03/03/24 21:17:08
Duration : 00:12:47 488ms
This notebook ends here :-)
https://fidle.cnrs.fr