Step 1 - A little cooking with datasets
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Build a 10x5 integer grid holding the values 0..49 in row-major order
a = np.arange(10 * 5).reshape((10, 5))
print('Starting data: \n', a)
Starting data: [[ 0 1 2 3 4] [ 5 6 7 8 9] [10 11 12 13 14] [15 16 17 18 19] [20 21 22 23 24] [25 26 27 28 29] [30 31 32 33 34] [35 36 37 38 39] [40 41 42 43 44] [45 46 47 48 49]]
In [3]:
# Wrap the array in a DataFrame, labelling its five columns A to E
df_all = pd.DataFrame(data=a, columns=list('ABCDE'))
print('\nDataFrame :')
display(df_all)
DataFrame :
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 0 | 1 | 2 | 3 | 4 |
1 | 5 | 6 | 7 | 8 | 9 |
2 | 10 | 11 | 12 | 13 | 14 |
3 | 15 | 16 | 17 | 18 | 19 |
4 | 20 | 21 | 22 | 23 | 24 |
5 | 25 | 26 | 27 | 28 | 29 |
6 | 30 | 31 | 32 | 33 | 34 |
7 | 35 | 36 | 37 | 38 | 39 |
8 | 40 | 41 | 42 | 43 | 44 |
9 | 45 | 46 | 47 | 48 | 49 |
In [4]:
# Shuffle data
# sample(frac=1) returns every row exactly once, in random order; each row
# keeps its original index label. The fixed random_state makes the
# permutation identical on every fresh "Restart & Run All".
df_all = df_all.sample(frac=1, axis=0, random_state=42)
print('\nDataFrame randomly shuffled :')
display(df_all)
DataFrame randomly shuffled :
A | B | C | D | E | |
---|---|---|---|---|---|
5 | 25 | 26 | 27 | 28 | 29 |
7 | 35 | 36 | 37 | 38 | 39 |
4 | 20 | 21 | 22 | 23 | 24 |
9 | 45 | 46 | 47 | 48 | 49 |
1 | 5 | 6 | 7 | 8 | 9 |
0 | 0 | 1 | 2 | 3 | 4 |
2 | 10 | 11 | 12 | 13 | 14 |
3 | 15 | 16 | 17 | 18 | 19 |
8 | 40 | 41 | 42 | 43 | 44 |
6 | 30 | 31 | 32 | 33 | 34 |
In [5]:
# Get a train part
# Randomly draw 80% of the rows of df_all. The fixed random_state keeps the
# train/test membership stable across kernel restarts, so results derived
# from this split can be reproduced.
df_train = df_all.sample(frac=0.8, axis=0, random_state=42)
print('\nTrain set (80%) :')
display(df_train)
Train set (80%) :
A | B | C | D | E | |
---|---|---|---|---|---|
3 | 15 | 16 | 17 | 18 | 19 |
1 | 5 | 6 | 7 | 8 | 9 |
7 | 35 | 36 | 37 | 38 | 39 |
2 | 10 | 11 | 12 | 13 | 14 |
6 | 30 | 31 | 32 | 33 | 34 |
8 | 40 | 41 | 42 | 43 | 44 |
5 | 25 | 26 | 27 | 28 | 29 |
4 | 20 | 21 | 22 | 23 | 24 |
In [6]:
# Test set = every row of df_all whose index label is not in the train set
df_test = df_all.drop(index=df_train.index)
print('\nTest set (all - train) :')
display(df_test)
Test set (all - train) :
A | B | C | D | E | |
---|---|---|---|---|---|
9 | 45 | 46 | 47 | 48 | 49 |
0 | 0 | 1 | 2 | 3 | 4 |
In [7]:
# Separate the features (every column except E) from the target (column E)
# in both the train and the test subsets, then show the train pair.
x_train, y_train = df_train.drop(columns='E'), df_train['E']
x_test, y_test = df_test.drop(columns='E'), df_test['E']
display(x_train, y_train)
A | B | C | D | |
---|---|---|---|---|
3 | 15 | 16 | 17 | 18 |
1 | 5 | 6 | 7 | 8 |
7 | 35 | 36 | 37 | 38 |
2 | 10 | 11 | 12 | 13 |
6 | 30 | 31 | 32 | 33 |
8 | 40 | 41 | 42 | 43 |
5 | 25 | 26 | 27 | 28 |
4 | 20 | 21 | 22 | 23 |
3 19 1 9 7 39 2 14 6 34 8 44 5 29 4 24 Name: E, dtype: int64