
[K3IMDB1] - Sentiment analysis with one-hot encoding¶

A basic example of sentiment analysis with sparse one-hot encoding, using a dataset from the Internet Movie Database (IMDb), with Keras 3 on PyTorch

Objectives :¶

  • The objective is to guess whether film reviews are positive or negative based on the analysis of the text.
  • Understand how to handle textual data and perform sentiment analysis

The original dataset can be found there
Note that IMDb.com offers several easy-to-use datasets
For simplicity's sake, we'll use the dataset directly embedded in Keras

What we're going to do :¶

  • Retrieve data
  • Prepare the data
  • Build a model
  • Train the model
  • Evaluate the result

Step 1 - Import and init¶

1.1 - Python stuff¶

In [1]:
import os
os.environ['KERAS_BACKEND'] = 'torch'

import keras
import keras.datasets.imdb as imdb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import fidle

# Init Fidle environment
run_id, run_dir, datasets_dir = fidle.init('K3IMDB1')


FIDLE - Environment initialization

Version              : 2.3.2
Run id               : K3IMDB1
Run dir              : ./run/K3IMDB1
Datasets dir         : /lustre/fswork/projects/rech/mlh/uja62cb/fidle-project/datasets-fidle
Start time           : 22/12/24 21:22:34
Hostname             : r3i6n0 (Linux)
Tensorflow log level : Info + Warning + Error  (=0)
Update keras cache   : False
Update torch cache   : False
Save figs            : ./run/K3IMDB1/figs (True)
keras                : 3.7.0
numpy                : 2.1.2
sklearn              : 1.5.2
yaml                 : 6.0.2
matplotlib           : 3.9.2
pandas               : 2.2.3
torch                : 2.5.0

1.2 - Parameters¶

The words in the vocabulary are ranked from the most frequent to the rarest.
vocab_size is the number of words we will keep in our vocabulary (all other words will be considered unknown).
hide_most_frequently is the number of most frequent words to ignore.
fit_verbosity is the verbosity during training : 0 = silent, 1 = progress bar, 2 = one line per epoch

In [2]:
vocab_size           = 5000
hide_most_frequently = 0

epochs               = 10
batch_size           = 512
fit_verbosity        = 1

Override parameters (batch mode) - Just forget this cell

In [3]:
fidle.override('vocab_size', 'hide_most_frequently', 'batch_size', 'epochs', 'fit_verbosity')
** Overrided parameters : **
fit_verbosity        : 2

Step 2 - Understanding one-hot encoding¶

We have a sentence and a dictionary :¶

In [4]:
sentence = "I've never seen a movie like this before"

dictionary  = {"a":0, "before":1, "fantastic":2, "i've":3, "is":4, "like":5, "movie":6, "never":7, "seen":8, "this":9}

We encode our sentence as a numerical vector :¶

In [5]:
sentence_words = sentence.lower().split()

sentence_vect  = [ dictionary[w] for w in sentence_words ]

print('Sentence words are         : ', sentence_words)
print('Our vectorized sentence is : ', sentence_vect)
Sentence words are         :  ["i've", 'never', 'seen', 'a', 'movie', 'like', 'this', 'before']
Our vectorized sentence is :  [3, 7, 8, 0, 6, 5, 9, 1]

Next, we one-hot encode our vectorized sentence as a tensor :¶

In [6]:
# ---- We get a (dictionary size x sentence length) matrix of zeros
#
onehot = np.zeros( (10,8) )

# ---- We set a 1 for each word of the sentence
#
for i,w in enumerate(sentence_vect):
    onehot[w,i]=1

# --- Show it
#
print('In a basic way :\n\n', onehot, '\n\nWith a pandas view :\n')
data={ f'{sentence_words[i]:.^10}':onehot[:,i] for i,w in enumerate(sentence_vect) }
df=pd.DataFrame(data)
df.index=dictionary.keys()
# --- Pretty display with a pandas Styler
#
df.style.format('{:1.0f}').highlight_max(axis=0).set_properties(**{'text-align': 'center'})
In a basic way :

 [[0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]] 

With a pandas view :

Out[6]:
  ...i've... ..never... ...seen... ....a..... ..movie... ...like... ...this... ..before..
a 0 0 0 1 0 0 0 0
before 0 0 0 0 0 0 0 1
fantastic 0 0 0 0 0 0 0 0
i've 1 0 0 0 0 0 0 0
is 0 0 0 0 0 0 0 0
like 0 0 0 0 0 1 0 0
movie 0 0 0 0 1 0 0 0
never 0 1 0 0 0 0 0 0
seen 0 0 1 0 0 0 0 0
this 0 0 0 0 0 0 1 0

Step 3 - Retrieve data¶

The IMDb dataset can be retrieved directly from Keras - see documentation
Note : due to its nature, textual data can be somewhat complex to handle.

3.1 - Data structure :¶

The dataset is composed of 2 parts:

  • reviews, this will be our x
  • opinions (positive/negative), this will be our y

There is also a dictionary, because the words in the reviews are indexed

<dataset> = (<reviews>, <opinions>)

with :  <reviews>  = [ <review1>, <review2>, ... ]
        <opinions> = [ <rate1>,   <rate2>,   ... ]   where <ratei>   = integer

where : <reviewi> = [ <w1>, <w2>, ...]    <wi> are the index (int) of the word in the dictionary
        <ratei>   = int                   0 for negative opinion, 1 for positive


<dictionary> = { <word1>:<w1>, <word2>:<w2>, ... }

with :  <wordi>   = word
        <wi>      = int
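
As a minimal sketch (with made-up indices, not real IMDb data), this structure corresponds in Python to :

# ---- A made-up example of the dataset structure (hypothetical values)
#
reviews    = [ [1, 14, 22, 2, 43],      # review 1 : a list of word indexes
               [1, 17, 6, 194] ]        # review 2
opinions   = [ 1, 0 ]                   # 1 = positive, 0 = negative
dictionary = { 'the':4, 'movie':20, 'great':87 }    # {word:index}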

3.2 - Load dataset¶

For simplicity, we will use a pre-formatted dataset - See documentation
However, Keras offers some useful tools for formatting textual data - See documentation
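
As an example, here is a minimal sketch with keras.layers.TextVectorization, which can build a vocabulary and encode raw strings directly (not used in this notebook; note that this layer may require TensorFlow for its string operations) :

# ---- A hypothetical, minimal sketch with TextVectorization (not used below)
#
texts = ["I've never seen a movie like this before", "A fantastic movie"]

vectorizer = keras.layers.TextVectorization(max_tokens=5000, output_mode='multi_hot')
vectorizer.adapt(texts)          # build the vocabulary from the texts

encoded = vectorizer(texts)      # one multi-hot vector per text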

By default :

  • Start of a sequence will be marked with : 1
  • Out of vocabulary word will be : 2
  • First index will be : 3
In [7]:
# ----- Retrieve x,y
#
start_char = 1      # Start of a sequence (padding is 0)
oov_char   = 2      # Out-of-vocabulary
index_from = 3      # First word id

(x_train, y_train), (x_test, y_test) = imdb.load_data( num_words  = vocab_size, 
                                                       skip_top   = hide_most_frequently,
                                                       start_char = start_char, 
                                                       oov_char   = oov_char, 
                                                       index_from = index_from)

# ---- About
#
print("Max(x_train,x_test)  : ", fidle.utils.rmax([x_train,x_test]) )
print("Min(x_train,x_test)  : ", fidle.utils.rmin([x_train,x_test]) )
print("Len(x_train)         : ", len(x_train))
print("Len(x_test)          : ", len(x_test))
Max(x_train,x_test)  :  4999
Min(x_train,x_test)  :  1
Len(x_train)         :  25000
Len(x_test)          :  25000

Step 4 - About our dataset¶

When we loaded the dataset, we asked to use <start> as 1 and <unknown word> as 2.
So, all word indexes are shifted by 3, via the parameter index_from=3, as illustrated below.
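
A minimal sketch of what this shift means (assuming, as in the raw Keras index, that 'the' is the most frequent word) :

# ---- The raw dictionary maps 'the' to 1...
#
raw_index = imdb.get_word_index()
print(raw_index['the'])          # 1
# ---- ...but in the encoded reviews, 0, 1 and 2 are reserved for <pad>, <start>, <unknown>,
#      so 'the' appears as 1 + index_from = 4 in x_train.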

4.1 - Sentences encoding¶

In [8]:
print('\nReview example (x_train[12]) :\n\n',x_train[12])
print('\nOpinions (y_train) :\n\n',y_train)
Review example (x_train[12]) :

 [1, 13, 119, 954, 189, 1554, 13, 92, 459, 48, 4, 116, 9, 1492, 2291, 42, 726, 4, 1939, 168, 2031, 13, 423, 14, 20, 549, 18, 4, 2, 547, 32, 4, 96, 39, 4, 454, 7, 4, 22, 8, 4, 55, 130, 168, 13, 92, 359, 6, 158, 1511, 2, 42, 6, 1913, 19, 194, 4455, 4121, 6, 114, 8, 72, 21, 465, 2, 304, 4, 51, 9, 14, 20, 44, 155, 8, 6, 226, 162, 616, 651, 51, 9, 14, 20, 44, 10, 10, 14, 218, 4843, 629, 42, 3017, 21, 48, 25, 28, 35, 534, 5, 6, 320, 8, 516, 5, 42, 25, 181, 8, 130, 56, 547, 3571, 5, 1471, 851, 14, 2286]

Opinions (y_train) :

 [1 0 0 ... 0 1 0]

4.2 - Load dictionary¶

In [9]:
# ---- Retrieve dictionary {word:index}, and encode it in ascii
#
word_index = imdb.get_word_index()

# ---- Shift the dictionary from <index_from>
#
word_index = {w:(i+index_from) for w,i in word_index.items()}

# ---- Add <pad>, <start> and <unknown> tags
#
word_index.update( {'<pad>':0, '<start>':1, '<unknown>':2, '<undef>':3,} )

# ---- Create a reverse dictionary : {index:word}
#
index_word = {index:word for word,index in word_index.items()} 

# ---- About dictionary
#
print('\nDictionary size     : ', len(word_index))
print('\nSmall extract :\n')
for k in range(440,455):print(f'    {k:2d} : {index_word[k]}' )

# ---- Add a nice function to translate a review back to text :
#
def dataset2text(review):
    return ' '.join([index_word.get(i, '?') for i in review])
Dictionary size     :  88588

Small extract :

    440 : hope
    441 : entertaining
    442 : she's
    443 : mr
    444 : overall
    445 : evil
    446 : called
    447 : loved
    448 : based
    449 : oh
    450 : several
    451 : fans
    452 : mother
    453 : drama
    454 : beginning

4.3 - Have a look, for human¶

In [10]:
fidle.utils.subtitle('Review example :')
print(x_train[12])
fidle.utils.subtitle('After translation :')
print(dataset2text(x_train[12]))


Review example :

[1, 13, 119, 954, 189, 1554, 13, 92, 459, 48, 4, 116, 9, 1492, 2291, 42, 726, 4, 1939, 168, 2031, 13, 423, 14, 20, 549, 18, 4, 2, 547, 32, 4, 96, 39, 4, 454, 7, 4, 22, 8, 4, 55, 130, 168, 13, 92, 359, 6, 158, 1511, 2, 42, 6, 1913, 19, 194, 4455, 4121, 6, 114, 8, 72, 21, 465, 2, 304, 4, 51, 9, 14, 20, 44, 155, 8, 6, 226, 162, 616, 651, 51, 9, 14, 20, 44, 10, 10, 14, 218, 4843, 629, 42, 3017, 21, 48, 25, 28, 35, 534, 5, 6, 320, 8, 516, 5, 42, 25, 181, 8, 130, 56, 547, 3571, 5, 1471, 851, 14, 2286]


After translation :

<start> i love cheesy horror flicks i don't care if the acting is sub par or whether the monsters look corny i liked this movie except for the <unknown> feeling all the way from the beginning of the film to the very end look i don't need a 10 page <unknown> or a sign with big letters explaining a plot to me but dark <unknown> takes the what is this movie about thing to a whole new annoying level what is this movie about br br this isn't exceptionally scary or thrilling but if you have an hour and a half to kill and or you want to end up feeling frustrated and confused rent this winner

4.4 - Few statistics¶

In [11]:
sizes=[len(i) for i in x_train]
plt.figure(figsize=(12,4))
plt.hist(sizes, bins=400)
plt.gca().set(title='Distribution of reviews by size - [{:5.2f}, {:5.2f}]'.format(min(sizes),max(sizes)), 
              xlabel='Size', ylabel='Density', xlim=[0,1500])
fidle.scrawler.save_fig('01-stats-sizes')
plt.show()
Saved: ./run/K3IMDB1/figs/01-stats-sizes
In [12]:
unk=[ 100*(s.count(oov_char)/len(s)) for s in x_train]
plt.figure(figsize=(12,4))
plt.hist(unk, bins=100)
plt.gca().set(title='Percent of unknown words - [{:5.2f}, {:5.2f}]'.format(min(unk),max(unk)), 
              xlabel='# unknown', ylabel='Density', xlim=[0,30])
fidle.scrawler.save_fig('02-stats-unknown')
plt.show()
Saved: ./run/K3IMDB1/figs/02-stats-unknown

Step 5 - Basic approach with "one-hot" vector encoding¶

Basic approach: each review is encoded as a vector whose length equals the size of the vocabulary.
The value of each component is 0 if the corresponding word is absent from the review and 1 if it is present.

For a sentence s=[3,4,7] and a dictionary of 10 words...
We will have a vector v=[0,0,0,1,1,0,0,1,0,0]
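
A minimal sketch of this toy example (not the real encoder, which is defined below) :

# ---- Toy one-hot encoding of the sentence s, for a 10-word dictionary
#
s = [3, 4, 7]
v = np.zeros(10)
for w in s:
    v[w] = 1

print(v)                         # [0. 0. 0. 1. 1. 0. 0. 1. 0. 0.]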

5.1 - Our one-hot encoder function¶

In [13]:
def one_hot_encoder(x, vector_size=10000):

    # ---- Set all to 0
    #
    x_encoded = np.zeros((len(x), vector_size))
    
    # ---- For each sentence
    #
    for i,sentence in enumerate(x):
        for word in sentence:
            x_encoded[i, word] = 1.

    return x_encoded

5.2 - Encoding..¶

In [14]:
x_train = one_hot_encoder(x_train, vector_size=vocab_size)
x_test  = one_hot_encoder(x_test,  vector_size=vocab_size)

print("To have a look, x_train[12] became :", x_train[12] )
To have a look, x_train[12] became : [0. 1. 1. ... 0. 0. 0.]

Step 6 - Build a nice model¶

In [15]:
model = keras.Sequential(name='My IMDB classifier')

model.add(keras.layers.Input( shape=(vocab_size,) ))
model.add(keras.layers.Dense( 32, activation='relu'))
model.add(keras.layers.Dense( 32, activation='relu'))
model.add(keras.layers.Dense( 1,  activation='sigmoid'))
    
model.compile(optimizer = 'rmsprop',
              loss      = 'binary_crossentropy',
              metrics   = ['accuracy'])

model.summary()
Model: "My IMDB classifier"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                         ┃ Output Shape                ┃         Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ dense (Dense)                        │ (None, 32)                  │         160,032 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_1 (Dense)                      │ (None, 32)                  │           1,056 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_2 (Dense)                      │ (None, 1)                   │              33 │
└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
 Total params: 161,121 (629.38 KB)
 Trainable params: 161,121 (629.38 KB)
 Non-trainable params: 0 (0.00 B)

Step 7 - Train the model¶

7.1 - Add callback¶

In [16]:
os.makedirs(f'{run_dir}/models',   mode=0o750, exist_ok=True)
save_dir = f'{run_dir}/models/best_model.keras'

savemodel_callback = keras.callbacks.ModelCheckpoint( filepath=save_dir, monitor='val_accuracy', mode='max', save_best_only=True)

7.2 - Train it¶

In [17]:
%%time

history = model.fit(x_train,
                    y_train,
                    epochs          = epochs,
                    batch_size      = batch_size,
                    validation_data = (x_test, y_test),
                    verbose         = fit_verbosity,
                    callbacks       = [savemodel_callback])
Epoch 1/10
49/49 - 2s - 43ms/step - accuracy: 0.7986 - loss: 0.4631 - val_accuracy: 0.8628 - val_loss: 0.3470
Epoch 2/10
49/49 - 2s - 36ms/step - accuracy: 0.8958 - loss: 0.2775 - val_accuracy: 0.8630 - val_loss: 0.3312
Epoch 3/10
49/49 - 2s - 36ms/step - accuracy: 0.9076 - loss: 0.2381 - val_accuracy: 0.8827 - val_loss: 0.2914
Epoch 4/10
49/49 - 2s - 36ms/step - accuracy: 0.9188 - loss: 0.2090 - val_accuracy: 0.8774 - val_loss: 0.2993
Epoch 5/10
49/49 - 2s - 36ms/step - accuracy: 0.9234 - loss: 0.1973 - val_accuracy: 0.8806 - val_loss: 0.2995
Epoch 6/10
49/49 - 2s - 36ms/step - accuracy: 0.9308 - loss: 0.1777 - val_accuracy: 0.8725 - val_loss: 0.3281
Epoch 7/10
49/49 - 2s - 36ms/step - accuracy: 0.9390 - loss: 0.1629 - val_accuracy: 0.8752 - val_loss: 0.3248
Epoch 8/10
49/49 - 2s - 36ms/step - accuracy: 0.9460 - loss: 0.1457 - val_accuracy: 0.8734 - val_loss: 0.3461
Epoch 9/10
49/49 - 2s - 36ms/step - accuracy: 0.9508 - loss: 0.1343 - val_accuracy: 0.8683 - val_loss: 0.3643
Epoch 10/10
49/49 - 2s - 36ms/step - accuracy: 0.9588 - loss: 0.1163 - val_accuracy: 0.8534 - val_loss: 0.4389
CPU times: user 14.3 s, sys: 3.68 s, total: 18 s
Wall time: 18.2 s

Step 8 - Evaluate¶

8.1 - Training history¶

In [18]:
fidle.scrawler.history(history, save_as='02-history')
Saved: ./run/K3IMDB1/figs/02-history_0
Saved: ./run/K3IMDB1/figs/02-history_1

8.2 - Reload and evaluate best model¶

In [19]:
model = keras.models.load_model(f'{run_dir}/models/best_model.keras')

# ---- Evaluate
score  = model.evaluate(x_test, y_test, verbose=0)

print('\n\nModel evaluation :\n')
print('    x_test / loss      : {:5.4f}'.format(score[0]))
print('    x_test / accuracy  : {:5.4f}'.format(score[1]))

values=[score[1], 1-score[1]]
fidle.scrawler.donut(values,["Accuracy","Errors"], title="#### Accuracy donut is :", save_as='03-donut')

# ---- Confusion matrix

y_sigmoid = model.predict(x_test, verbose=fit_verbosity)

y_pred = y_sigmoid.copy()
y_pred[ y_sigmoid< 0.5 ] = 0
y_pred[ y_sigmoid>=0.5 ] = 1    

fidle.scrawler.confusion_matrix_txt(y_test,y_pred,labels=range(2))
fidle.scrawler.confusion_matrix(y_test,y_pred,range(2), figsize=(8, 8),normalize=False, save_as='04-confusion-matrix')

Model evaluation :

    x_test / loss      : 0.2914
    x_test / accuracy  : 0.8827

Accuracy donut is :¶

Saved: ./run/K3IMDB1/figs/03-donut
782/782 - 2s - 2ms/step

Confusion matrix is :¶

  0 1
0 0.84 0.16
1 0.08 0.92
Saved: ./run/K3IMDB1/figs/04-confusion-matrix
In [20]:
fidle.end()

End time : 22/12/24 21:23:16
Duration : 00:00:42 807ms
This notebook ends here :-)
https://fidle.cnrs.fr

