[K3IMDB4] - Reload embedded vectors¶
Retrieving embedded vectors from our trained model, using Keras 3 and PyTorch
Objectives :¶
- The objective is to retrieve and visualize our embedded vectors
- For this, we will use our previously saved model.
What we're going to do :¶
- Retrieve our saved model
- Extract vectors and play with them
Step 1 - Init python stuff¶
In [1]:
import os
os.environ['KERAS_BACKEND'] = 'torch'
import keras
import json,re
import numpy as np
import fidle
# Init Fidle environment
run_id, run_dir, datasets_dir = fidle.init('K3IMDB4')
FIDLE - Environment initialization
Version              : 2.3.0
Run id               : K3IMDB4
Run dir              : ./run/K3IMDB4
Datasets dir         : /gpfswork/rech/mlh/uja62cb/fidle-project/datasets-fidle
Start time           : 03/03/24 21:11:35
Hostname             : r6i0n6 (Linux)
Tensorflow log level : Warning + Error (=1)
Update keras cache   : False
Update torch cache   : False
Save figs            : ./run/K3IMDB4/figs (True)
keras                : 3.0.4
numpy                : 1.24.4
sklearn              : 1.3.2
yaml                 : 6.0.1
matplotlib           : 3.8.2
pandas               : 2.1.3
torch                : 2.1.1
1.2 - Parameters¶
The words in the vocabulary are ranked from the most frequent to the rarest.
vocab_size
is the number of words we keep in our vocabulary (all other words are treated as unknown).
review_len
is the fixed length of each review.
saved_models
is where our models were previously saved.
dictionaries_dir
is where our dictionaries were previously saved (./data is a good choice).
In [2]:
vocab_size = 5000
review_len = 256
saved_models = './run/K3IMDB2'
dictionaries_dir = './data'
Override parameters (batch mode) - Just forget this cell
In [3]:
fidle.override('vocab_size', 'review_len', 'saved_models', 'dictionaries_dir')
Step 2 - Get the embedding vectors !¶
2.1 - Load model and dictionaries¶
Note : This dictionary was generated by the 02-Embedding-Keras notebook.
In [4]:
model = keras.models.load_model(f'{saved_models}/models/best_model.keras')
print('Model loaded.')
with open(f'{dictionaries_dir}/word_index.json', 'r') as fp:
word_index = json.load(fp)
index_word = { i:w for w,i in word_index.items() }
print('Dictionaries loaded. ', len(word_index), 'entries' )
Model loaded.
Dictionaries loaded.  88588 entries
2.2 - Retrieve embeddings¶
In [5]:
embeddings = model.layers[0].get_weights()[0]
print('Shape of embeddings : ',embeddings.shape)
Shape of embeddings : (5000, 32)
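Here the Embedding layer is assumed to be model.layers[0], which matches the model built in the previous notebook. If in doubt, a minimal sketch to locate it by type rather than by position:
# Locate the Embedding layer by type instead of assuming it is layers[0] (sketch)
embedding_layer = next(l for l in model.layers if isinstance(l, keras.layers.Embedding))
embeddings      = embedding_layer.get_weights()[0]
print('Shape of embeddings : ', embeddings.shape)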
2.3 - Build a nice dictionary¶
In [6]:
word_embedding = { index_word[i]:embeddings[i] for i in range(vocab_size) }
In [7]:
word_embedding['nice']
Out[7]:
array([-0.19265787, -0.1782463 , -0.12522878, 0.13078946, -0.18754876, -0.21244262, -0.22118522, 0.19628656, -0.12525214, 0.16055878, 0.14045134, 0.12333236, -0.15863857, 0.19821374, -0.19368635, 0.19027653, 0.16695064, 0.14144616, -0.1473321 , 0.20249814, -0.18405882, 0.13139895, 0.12899621, -0.14405546, 0.15086392, -0.16722818, -0.16204427, -0.11995099, 0.18977174, 0.11766762, -0.18468359, -0.15323788], dtype=float32)
A few useful functions to play with¶
In [8]:
# Return the L2 distance between 2 words
#
def l2w(w1, w2):
    v1 = word_embedding[w1]
    v2 = word_embedding[w2]
    return np.linalg.norm(v2 - v1)

# Show the L2 distance between 2 words
#
def show_l2(w1, w2):
    print(f'\nL2 between [{w1}] and [{w2}] : ', l2w(w1, w2))

# Display the 14 closest words to a given word
# (search is limited to the 1000 most frequent words; indices 0-3 are special tokens)
#
def neighbors(w1):
    dd = {}
    for i in range(4, 1000):
        w2 = index_word[i]
        dd[w2] = l2w(w1, w2)
    dd = {k: v for k, v in sorted(dd.items(), key=lambda item: item[1])}
    # entry [0] is the word itself (distance 0), so we keep entries 1..14
    print(f'\nNeighbors of [{w1}] : ', list(dd.keys())[1:15])
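Besides the L2 distance used above, cosine similarity is another common way to compare embedding vectors. A minimal sketch, where the cosine helper is ours and not part of the notebook:
# Cosine similarity between two word vectors (1.0 = same direction, 0.0 = orthogonal)
def cosine(w1, w2):
    v1, v2 = word_embedding[w1], word_embedding[w2]
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

print('cosine(nice, pleasant) : ', cosine('nice', 'pleasant'))
print('cosine(nice, horrible) : ', cosine('nice', 'horrible'))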
Examples¶
In [9]:
show_l2('nice', 'pleasant')
show_l2('nice', 'horrible')
neighbors('horrible')
neighbors('great')
L2 between [nice] and [pleasant] :  0.5500398
L2 between [nice] and [horrible] :  3.900939

Neighbors of [horrible] :  ['save', 'annoying', 'dull', 'mess', 'terrible', 'ridiculous', 'badly', 'poor', 'avoid', 'worse', 'fails', 'boring', 'predictable', 'lame']

Neighbors of [great] :  ['fantastic', 'enjoyed', 'definitely', '9', 'enjoyable', 'brilliant', 'fun', 'loved', 'masterpiece', 'surprised', 'wonderful', 'highly', 'hilarious', 'amazing']
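To visualize the vectors, as stated in the objectives, one option is to project them to 2D. A minimal sketch using sklearn's PCA and matplotlib (both listed in the environment above); the choice of the 100 most frequent words is arbitrary:
# Project the embeddings of the 100 most frequent words to 2D and plot them
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

words   = [index_word[i] for i in range(4, 104)]      # skip the special tokens (indices 0-3)
vectors = np.array([word_embedding[w] for w in words])
xy      = PCA(n_components=2).fit_transform(vectors)

plt.figure(figsize=(12, 8))
plt.scatter(xy[:, 0], xy[:, 1], s=10)
for (x, y), w in zip(xy, words):
    plt.annotate(w, (x, y), fontsize=8)
plt.title('IMDB word embeddings - PCA projection')
plt.show()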
In [10]:
fidle.end()
End time : 03/03/24 21:11:35
Duration : 00:00:00 386ms
This notebook ends here :-)
https://fidle.cnrs.fr