Real Data

Outline: Using a Real Dataset

Analyze Data

Let's see how we can quickly train a new model on a new dataset by using a PyTorch Dataset. Let's first look at our new data; in this example it is Welding_Perceptron_data.xlsx.

import pandas as pd

# Load the welding measurements into a DataFrame (the file can also simply
# be opened in Excel).
df = pd.read_excel(r"_dlcourse/code/Linear Models/Welding_Perceptron_data.xlsx")

# Pretty-print settings for the preview below: centered column headers and
# three digits of precision.
pd.options.display.colheader_justify = 'center'
pd.options.display.precision = 3

# Report the number of rows, then show the first five.
print(f"Total number of samples: {df.shape[0]}")
print("Top 5 rows:")
print(df.head())

Total number of samples: 480
Top 5 rows:
   TrialNo  Current  Angle  Speed  Time   Height
0     1       160      0    0.005  5.003    39  
1     1       160      0    0.005  5.502    39  
2     1       160      0    0.005  5.969    39  
3     1       160      0    0.005  6.453    38  
4     1       160      0    0.005  6.920    40

As you can see, we have 6 columns of data. For this experiment, I want to correlate the Current, Angle, Speed, and Time to the Height.

Create a Dataset

Let's first meet the requirements for creating a custom PyTorch Dataset. There are three: inherit from the Dataset class, create a `__len__` function, and create a `__getitem__` function. Let's also give the class a name.

from torch.utils.data import Dataset

class WeldingRegression(Dataset):
    """Skeleton of a custom PyTorch dataset for the welding data.

    Shows the three requirements of a custom dataset: inherit from
    ``Dataset``, define ``__len__``, and define ``__getitem__``. The two
    methods are placeholders here and are filled in by the following steps.
    """

    def __init__(self):
        """Initialize the ``Dataset`` base class; no data is loaded yet."""
        super().__init__()

    def __len__(self):
        """Return the total number of samples (placeholder in this skeleton)."""
        # An explicit placeholder keeps the skeleton valid Python and fails
        # loudly if it is used before the real implementation is added.
        raise NotImplementedError("__len__ is implemented in a later step")

    def __getitem__(self, index):
        """Return the sample at ``index`` (placeholder in this skeleton)."""
        raise NotImplementedError("__getitem__ is implemented in a later step")

Next, let's figure out how we are going to read the relevant data in Python.

import pandas as pd
from torch.utils.data import Dataset

class WeldingRegression(Dataset):
    """Welding dataset, intermediate step: read the file and pick columns.

    At this stage the selected columns are only local variables; nothing is
    stored on the instance yet.
    """

    def __init__(self):
        """Read the spreadsheet and split it into input and target columns."""
        super().__init__()

        # Columns used as model inputs and as the value to predict.
        feature_columns = ['Current','Angle','Speed','Time']
        label_columns = ['Height']

        sheet = pd.read_excel("Welding_Perceptron_data.xlsx")

        # Each selection is a DataFrame holding only the listed columns.
        df_inputs = sheet.loc[:, feature_columns]
        df_targets = sheet.loc[:, label_columns]

Now that we have the relevant data separated out, we can convert it to tensors.

import pandas as pd
import torch 
from torch.utils.data import Dataset

class WeldingRegression(Dataset):
    """Welding dataset that holds its inputs and targets as float tensors.

    The inputs are the ``Current``, ``Angle``, ``Speed`` and ``Time`` columns
    of ``Welding_Perceptron_data.xlsx``; the target is the ``Height`` column.
    """

    def __init__(self):
        """Read the spreadsheet and convert the selected columns to tensors."""
        super().__init__()

        df_data = pd.read_excel("Welding_Perceptron_data.xlsx")
        input_names =  ['Current','Angle','Speed','Time']
        target_names = ['Height']
        df_inputs = df_data[input_names]
        df_targets = df_data[target_names]

        # .values exposes the selected columns as NumPy arrays, which
        # torch.tensor can consume directly.
        ar_inputs = df_inputs.values
        ar_targets = df_targets.values

        # Stored under the underscore-prefixed names that __len__ and
        # __getitem__ read; .float() casts both tensors to float32.
        self._tensor_inputs = torch.tensor(ar_inputs).float()
        self._tensor_targets = torch.tensor(ar_targets).float()

We have now prepared the data for our data loader. The final step is to add the `__len__` function, which needs to return the total number of samples (the size along axis 0), and the `__getitem__` function, which returns the inputs and targets for a given index.

def __len__(self):
    """Return the number of samples, i.e. the length of the stored targets."""
    sample_count = len(self._tensor_targets)
    return sample_count

def __getitem__(self, index):
    """Return the ``(inputs, targets)`` pair stored at ``index``."""
    return self._tensor_inputs[index], self._tensor_targets[index]

Finally, we want the ability to normalize our data, so we create a normalize function and apply it to our tensors when a `normalize` flag is set.

# Fragment that belongs at the end of __init__, after both tensors have been
# created; `normalize` here is the boolean constructor flag. Each call
# overwrites the tensor it is given, so nothing is reassigned.
if normalize:
    self.normalize(self._tensor_inputs)
    self.normalize(self._tensor_targets)

def normalize(self,tensor:torch.Tensor) -> None:
    """Min-max scale ``tensor`` in place along axis 0.

    Every column is mapped to the range [0, 1] using its own minimum and
    maximum. A column whose values are all equal is mapped to 0.

    Args:
        tensor: Tensor to rescale; it is overwritten with the result.
    """
    col_min = tensor.amin(0)
    col_range = tensor.amax(0) - col_min
    # A constant column has zero range, and 0/0 would fill it with NaN.
    # Dividing by 1 instead leaves that column at 0.
    col_range = col_range.masked_fill(col_range == 0, 1)
    # Slice assignment writes into the caller's tensor rather than rebinding
    # the local name.
    tensor[:] = (tensor - col_min) / col_range

Clean Up

def normalize_tensor(tensor:torch.Tensor) -> None:
    """Min-max scale ``tensor`` in place along axis 0.

    Every column is mapped to the range [0, 1] using its own minimum and
    maximum. A column whose values are all equal is mapped to 0.

    Args:
        tensor: Tensor to rescale; it is overwritten with the result.
    """
    col_min = tensor.amin(0)
    col_range = tensor.amax(0) - col_min
    # A constant column has zero range, and 0/0 would fill it with NaN.
    # Dividing by 1 instead leaves that column at 0.
    col_range = col_range.masked_fill(col_range == 0, 1)
    # Slice assignment writes into the caller's tensor rather than rebinding
    # the local name.
    tensor[:] = (tensor - col_min) / col_range

class WeldingDataset(Dataset):
    """Dataset built from selected columns of an Excel sheet.

    The chosen input and target columns are held as float tensors, and
    indexing the dataset yields one ``(inputs, targets)`` pair.
    """

    def __init__(self,
                 data_file:str,
                 input_names:List,
                 target_names:List,
                 normalize:bool=False,
                 ):
        """Load ``data_file`` and build the input and target tensors.

        Args:
            data_file: Path of the Excel file to read.
            input_names: Column names used as model inputs.
            target_names: Column names used as targets.
            normalize: If true, pass both tensors through
                ``normalize_tensor`` after they are built.
        """
        super().__init__()

        sheet:pd.DataFrame = pd.read_excel(data_file)
        input_frame = sheet[input_names]
        target_frame = sheet[target_names]

        self._tensor_inputs = torch.tensor(input_frame.values).float()

        # Targets are shaped (n_samples, n_targets) before the float cast.
        n_targets = len(target_names)
        raw_targets = torch.tensor(target_frame.values)
        self._tensor_targets = raw_targets.view(-1, n_targets).float()

        if normalize:
            normalize_tensor(self._tensor_inputs)
            normalize_tensor(self._tensor_targets)

    def __len__(self):
        """Return the number of samples (length of the target tensor)."""
        sample_count = len(self._tensor_targets)
        return sample_count

    def __getitem__(self, index):
        """Return the ``(inputs, targets)`` pair stored at ``index``."""
        return self._tensor_inputs[index], self._tensor_targets[index]

Use new Data

if __name__ == "__main__":
    # seed_everything is defined outside this snippet; presumably it fixes
    # the random seeds so runs are repeatable (confirm against its source).
    seed_everything(1)

    # Columns fed to the model and the column it should predict.
    input_names =  ['Current','Angle','Speed','Time']
    target_names = ['Height']

    dataset = WeldingDataset(
        data_file="Welding_Perceptron_data.xlsx",
        input_names=input_names,
        target_names=target_names,
        normalize=False,
    )
    ...
    # First and last arguments follow the number of input and target
    # columns; 10 is passed through as the middle argument unchanged.
    model = mlpModel(
        len(input_names),
        10,
        len(target_names),
    )