singularity


Three core components of PyTorch


Deep learning


Training process


Tensor ops


import torch
tensor0d = torch.tensor(1) #1
tensor1d = torch.tensor([1, 2, 3]) #2
tensor2d = torch.tensor([[1, 2],
                         [3, 4]]) #3
tensor3d = torch.tensor([[[1, 2], [3, 4]],
                         [[5, 6], [7, 8]]]) #4


#1 Creates a zero-dimensional tensor (scalar) from a Python integer
#2 Creates a one-dimensional tensor (vector) from a Python list
#3 Creates a two-dimensional tensor from a nested Python list
#4 Creates a three-dimensional tensor from a nested Python list
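
A quick way to confirm the dimensionality of each tensor is to inspect its .shape attribute. The check below is an addition (not part of the original listing) and continues from the tensors created above.

print(tensor0d.shape) # torch.Size([])
print(tensor1d.shape) # torch.Size([3])
print(tensor2d.shape) # torch.Size([2, 2])
print(tensor3d.shape) # torch.Size([2, 2, 2])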

Precision / Tensor Data Types

floatvec = tensor1d.to(torch.float32)
print(floatvec.dtype)
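
For reference (an added note, not from the original text): PyTorch creates tensors from Python integers with a 64-bit integer dtype and tensors from Python floats with a 32-bit float dtype, which is why tensor1d has to be converted explicitly when 32-bit floating-point precision is desired.

print(torch.tensor([1, 2, 3]).dtype)       # torch.int64
print(torch.tensor([1.0, 2.0, 3.0]).dtype) # torch.float32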

# Matrix multiplication of tensor2d with its transpose
print(tensor2d.matmul(tensor2d.T))
# The output is
# tensor([[ 5, 11],
#         [11, 25]])

# The same operation with the @ operator
print(tensor2d @ tensor2d.T)
# tensor([[ 5, 11],
#         [11, 25]])

Autograd - PyTorch’s automatic differentiation engine

# Listing A.2 A logistic regression forward pass
import torch.nn.functional as F

y = torch.tensor([1.0])    # true label
x1 = torch.tensor([1.1])   # input feature
w1 = torch.tensor([2.2])   # weight parameter
b = torch.tensor([0.0])    # bias unit

z = x1 * w1 + b            # net input
a = torch.sigmoid(z)       # activation and output

loss = F.binary_cross_entropy(a, y)
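
For these particular values, the result can be checked by hand: z = 1.1 * 2.2 = 2.42, a = sigmoid(2.42) ≈ 0.918, and the binary cross-entropy loss for a positive label is -ln(0.918) ≈ 0.085. The printout below is an added sanity check, not part of the original listing.

print(loss)
# tensor(0.0852)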


Partial Derivatives And Gradients

# Listing A.3 Computing gradients via autograd
import torch.nn.functional as F
from torch.autograd import grad

y = torch.tensor([1.0])
x1 = torch.tensor([1.1])
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

z = x1 * w1 + b
a = torch.sigmoid(z)

loss = F.binary_cross_entropy(a, y)
# By default, PyTorch destroys the computation graph after calculating the
# gradients to free memory. However, since we will reuse this computation
# graph shortly, we set retain_graph=True so that it stays in memory.
grad_L_w1 = grad(loss, w1, retain_graph=True)
grad_L_b = grad(loss, b, retain_graph=True)

print(grad_L_w1)
print(grad_L_b)
# The outputs are
# (tensor([-0.0898]),)
# (tensor([-0.0817]),)

# .backward() computes the same gradients and stores them in the parameters' .grad attributes
loss.backward()
print(w1.grad)
print(b.grad)
# The outputs are
# tensor([-0.0898])
# tensor([-0.0817])
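
These values can be verified analytically: for a sigmoid output combined with binary cross entropy, dL/dz = a - y, so dL/dw1 = (a - y) * x1 and dL/db = a - y. The snippet below is an added sanity check using the tensors defined above; it is not part of the original listing.

with torch.no_grad():
    print((a - y) * x1)  # approximately tensor([-0.0898])
    print(a - y)         # approximately tensor([-0.0817])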

Implementing multilayer/deep neural networks


# Listing A.4 A multilayer perceptron with two hidden layers

class NeuralNetwork(torch.nn.Module):
    """
    A multilayer perceptron with two hidden layers.
    
    We can subclass the torch.nn.Module class to define our own custom network 
    architecture. This Module base class provides a lot of functionality, making 
    it easier to build and train models. For instance, it allows us to encapsulate 
    layers and operations and keep track of the model's parameters.
    """

    def __init__(self, num_inputs, num_outputs):
        """
        Initialize the neural network with specified input and output dimensions.
        
        Within this subclass, we define the network layers in the __init__ constructor 
        and specify how the layers interact in the forward method.
        
        Args:
            num_inputs (int): Number of input features. Coding the number of inputs 
                and outputs as variables allows us to reuse the same code for datasets 
                with different numbers of features and classes.
            num_outputs (int): Number of output classes/neurons.
        """
        super().__init__()
        self.layers = torch.nn.Sequential(
            # 1st hidden layer
            torch.nn.Linear(num_inputs, 30),  
            # The Linear layer takes the number of input and output nodes as arguments
            torch.nn.ReLU(),  
            # Nonlinear activation functions are placed between the hidden layers

            # 2nd hidden layer  
            torch.nn.Linear(30, 20),  
            # The number of output nodes of one hidden layer has
            # to match the number of inputs of the 
            # next layer
            torch.nn.ReLU(),

            # output layer
            torch.nn.Linear(20, num_outputs),
        )

    def forward(self, x):
        """
        Define the forward pass of the network.
        
        The forward method describes how the input data passes through the network 
        and comes together as a computation graph.
        
        Args:
            x (torch.Tensor): Input tensor.
            
        Returns:
            torch.Tensor: The outputs of the last layer (logits).
            
        Note:
            In contrast, the backward method, which we typically do not need to 
            implement ourselves, is used during training to compute gradients of 
            the loss function given the model parameters.
        """
        logits = self.layers(x)
        return logits  # The outputs of the last layer are called logits


# Example usage
model = NeuralNetwork(50, 3)
print(model)
# Output:
# NeuralNetwork(
#   (layers): Sequential(
#     (0): Linear(in_features=50, out_features=30, bias=True)
#     (1): ReLU()
#     (2): Linear(in_features=30, out_features=20, bias=True)
#     (3): ReLU()
#     (4): Linear(in_features=20, out_features=3, bias=True)
#   )
# )

num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of trainable model parameters:", num_params)

#Total number of trainable model parameters: 2213
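
This count can be verified by hand: the first hidden layer has 50 * 30 weights plus 30 biases (1,530 parameters), the second has 30 * 20 + 20 (620), and the output layer has 20 * 3 + 3 (63), which sums to 2,213. The added snippet below (not part of the original listing) prints the per-layer breakdown.

for name, param in model.named_parameters():
    print(name, param.numel())
# layers.0.weight 1500
# layers.0.bias 30
# layers.2.weight 600
# layers.2.bias 20
# layers.4.weight 60
# layers.4.bias 3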

print(model.layers[0].weight)
# This prints
# Parameter containing:
# tensor([[ 0.1174, -0.1350, -0.1227, ..., 0.0275, -0.0520, -0.0192],
# [-0.0169, 0.1265, 0.0255, ..., -0.1247, 0.1191, -0.0698],
# [-0.0973, -0.0974, -0.0739, ..., -0.0068, -0.0892, 0.1070],
# ...,
# [-0.0681, 0.1058, -0.0315, ..., -0.1081, -0.0290, -0.1374],
# [-0.0159, 0.0587, -0.0916, ..., -0.1153, 0.0700, 0.0770],
# [-0.1019, 0.1345, -0.0176, ..., 0.0114, -0.0559, -0.0088]],
# requires_grad=True)

print(model.layers[0].weight.shape)
# The result is
# torch.Size([30, 50])

# Similarly, you could access the bias vector via model.layers[0].bias.

# Seeding PyTorch's random number generator makes the weight initialization reproducible
torch.manual_seed(123)
model = NeuralNetwork(50, 3)

print(model.layers[0].weight) 

# Parameter containing:
# tensor([[-0.0577, 0.0047, -0.0702, ..., 0.0222, 0.1260, 0.0865],
# [ 0.0502, 0.0307, 0.0333, ..., 0.0951, 0.1134, -0.0297],
# [ 0.1077, -0.1108, 0.0122, ..., 0.0108, -0.1049, -0.1063],
# ...,
# [-0.0787, 0.1259, 0.0803, ..., 0.1218, 0.1303, -0.1351],
# [ 0.1359, 0.0175, -0.0673, ..., 0.0674, 0.0676, 0.1058],
# [ 0.0790, 0.1343, -0.0293, ..., 0.0344, -0.0971, -0.0509]],
# requires_grad=True)
torch.manual_seed(123)
X = torch.rand((1, 50)) #toy input
out = model(X)

print(out)
# tensor([[-0.1262, 0.1080, -0.1792]], grad_fn=<AddmmBackward0>)

# If we only want to use the network for inference, wrapping the forward pass in
# torch.no_grad() tells PyTorch not to build a computation graph, so grad_fn disappears
with torch.no_grad():
    out = model(X)
print(out)
# tensor([[-0.1262,  0.1080, -0.1792]])

# To obtain class-membership probabilities, apply softmax to the logits
with torch.no_grad():
    out = torch.softmax(model(X), dim=1)
print(out)
# This prints
# tensor([[0.3113, 0.3934, 0.2952]])
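
To turn these probabilities (or the raw logits) into a predicted class label, we can take the index of the largest value. The snippet below is an added illustration, not part of the original listing.

predictions = torch.argmax(out, dim=1)
print(predictions)
# tensor([1])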

Setting up efficient data loaders


# Listing A.5 Creating a small toy dataset
X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
])

y_train = torch.tensor([0, 0, 0, 1, 1])

X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6],
])

y_test = torch.tensor([0, 1])

# Listing A.6 Defining a custom Dataset class
from torch.utils.data import Dataset

class ToyDataset(Dataset):
    def __init__(self, X, y):
        self.features = X
        self.labels = y

    def __getitem__(self, index): #1
        one_x = self.features[index] #1
        one_y = self.labels[index] #1
        return one_x, one_y #1

    def __len__(self):
        return self.labels.shape[0] #2

train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)

#1 Instructions for retrieving exactly one data record and the corresponding label
#2 Instructions for returning the total length of the dataset
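
We can use these Dataset objects like Python sequences: len() returns the number of examples and indexing returns a single (features, label) pair. The added check below is not part of the original listing.

print(len(train_ds))
# 5
print(train_ds[0])
# (tensor([-1.2000,  3.1000]), tensor(0))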
# Listing A.7 Instantiating data loaders
from torch.utils.data import DataLoader
torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_ds, #1
    batch_size=2,
    shuffle=True, #2
    num_workers=0, #3
    drop_last=True
)

test_loader = DataLoader(
    dataset=test_ds,
    batch_size=2,
    shuffle=False, #4
    num_workers=0
)

#1 The ToyDataset instance created earlier serves as input to the data loader.
#2 Whether or not to shuffle the data
#3 The number of background processes
#4 It is not necessary to shuffle a test dataset.
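
Iterating over the training loader yields one batch of features and labels per step; with batch_size=2 and drop_last=True, the five training examples produce two full batches and the leftover example is dropped. The loop below is an added illustration, not part of the original listing.

for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y)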

A typical training loop
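
No code is included for this section yet; the sketch below is a minimal example of what such a loop typically looks like for the toy dataset above (2 input features, 2 classes). The learning rate and number of epochs are illustrative choices, not values from the original text.

import torch.nn.functional as F

torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)  # illustrative learning rate

num_epochs = 3
for epoch in range(num_epochs):
    model.train()  # training mode (relevant for layers like dropout or batch norm)
    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)
        loss = F.cross_entropy(logits, labels)  # cross entropy expects raw logits

        optimizer.zero_grad()  # reset gradients from the previous step
        loss.backward()        # compute gradients of the loss w.r.t. the parameters
        optimizer.step()       # update the parameters using the gradients

    model.eval()  # switch to evaluation mode, e.g. before computing validation metrics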

Saving and loading models
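
The usual approach is to save the model's learned parameters (its state_dict) rather than the whole model object. The sketch below is a minimal example; the file name "model.pth" is an arbitrary choice.

torch.save(model.state_dict(), "model.pth")  # "model.pth" is an arbitrary file name

# Loading requires a model instance with the same architecture as the saved one
model = NeuralNetwork(num_inputs=2, num_outputs=2)
model.load_state_dict(torch.load("model.pth"))
model.eval()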

Optimizing training performance with GPUs
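
A minimal sketch of moving the computation to a GPU, assuming a CUDA-capable device is available: the model parameters and every batch of data must live on the same device before the forward pass.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)  # moves the model parameters to the GPU if one is available

for features, labels in train_loader:
    features, labels = features.to(device), labels.to(device)  # move the batch as well
    logits = model(features)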