PyTorch
PyTorch is a powerful deep learning framework that provides flexible and efficient tools for building neural networks. Developed by Meta AI (formerly Facebook's AI Research lab), PyTorch is widely used in research and production thanks to its dynamic computation graphs, intuitive Python API, and excellent debugging capabilities. It has become one of the most popular frameworks for machine learning research and applications.
PyTorch Tensors
Tensors are the fundamental data structure in PyTorch, similar to NumPy arrays but with GPU acceleration capabilities.
import torch
import numpy as np

# --- Creating tensors ---
x = torch.tensor([1, 2, 3, 4, 5])  # from a Python list
y = torch.zeros(3, 4)              # 3x4 tensor of zeros
z = torch.ones(2, 3, 4)            # 2x3x4 tensor of ones
r = torch.randn(3, 3)              # 3x3 tensor, standard-normal values

# --- Interop with NumPy ---
np_array = np.array([1, 2, 3])
tensor_from_np = torch.from_numpy(np_array)  # shares memory with np_array

# --- Elementwise and linear-algebra operations ---
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([4.0, 5.0, 6.0])
c = torch.add(a, b)  # element-wise addition (same as a + b)
d = a @ b            # dot product (same as torch.matmul for 1-D tensors)
e = torch.mul(a, 2)  # scalar multiplication (same as a * 2)

# --- Inspecting tensor properties ---
print(f"Shape: {a.shape}")
print(f"Data type: {a.dtype}")
print(f"Device: {a.device}")

# --- Reshaping (a view shares storage with the original tensor) ---
x = torch.randn(4, 4)
y = x.view(16)     # flatten to 1-D
z = x.view(-1, 8)  # 2x8; -1 lets PyTorch infer that dimension
Using Devices (CPU and GPU)
PyTorch allows seamless transfer of tensors between CPU and GPU for accelerated computation.
import torch

# Select the best available device: CUDA GPU if present, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Report GPU details only when one is actually available.
if torch.cuda.is_available():
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

# Create tensors on a specific device.
x_cpu = torch.tensor([1, 2, 3])
# BUG FIX: creating a tensor with device="cuda" raises a RuntimeError on
# CPU-only machines — route through the device variable instead.
x_gpu = torch.tensor([1, 2, 3], device=device)

# Move tensors between devices.
x = torch.randn(3, 3)
x_on_gpu = x.to(device)         # move to GPU (stays on CPU if no GPU)
x_back_to_cpu = x_on_gpu.cpu()  # move back to CPU

# Operations run on whichever device the operands live on.
a = torch.randn(1000, 1000, device=device)
b = torch.randn(1000, 1000, device=device)
c = torch.matmul(a, b)  # Runs on GPU if device is cuda

# Best practice: use the device variable for models and data alike.
# BUG FIX: the original referenced undefined names (YourModel, data) —
# use a small concrete module so the example actually runs.
model = torch.nn.Linear(3, 1).to(device)  # stand-in for your own model
data = torch.randn(8, 3).to(device)
Declaring Parameters
Parameters are tensors that require gradient computation and are automatically tracked by optimizers.
import torch
import torch.nn as nn

# Manual parameter creation: requires_grad=True makes autograd track them.
weights = torch.randn(5, 3, requires_grad=True)
bias = torch.zeros(3, requires_grad=True)


class SimpleModel(nn.Module):
    """Minimal linear model built from explicit nn.Parameter tensors."""

    def __init__(self):
        super().__init__()
        # nn.Parameter tensors assigned as attributes of a Module are
        # automatically registered and returned by model.parameters().
        self.weight = nn.Parameter(torch.randn(10, 5))
        self.bias = nn.Parameter(torch.zeros(5))

    def forward(self, x):
        # x: (batch, 10) -> (batch, 5)
        return torch.matmul(x, self.weight) + self.bias


model = SimpleModel()

# Access all registered parameters by name.
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}, requires_grad={param.requires_grad}")


def init_weights(m):
    """Xavier-initialize Linear layers; applied recursively via model.apply()."""
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)


# apply() visits every submodule; SimpleModel has no nn.Linear, so this is
# a no-op here — it is shown as the common initialization pattern.
model.apply(init_weights)
Constructing Models with Module Class
The nn.Module class is the base class for all neural network modules in PyTorch.
import torch
import torch.nn as nn
import torch.nn.functional as F


class NeuralNetwork(nn.Module):
    """Three-layer fully connected classifier with dropout.

    Args:
        input_size: number of input features.
        hidden_size: width of both hidden layers.
        output_size: number of output classes (raw logits).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        # Define layers
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.2)  # active only in train() mode

    def forward(self, x):
        # x: (batch, input_size) -> logits (batch, output_size)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)  # no activation: return raw logits
        return x


# Instantiate model (e.g. flattened 28x28 MNIST images, 10 classes)
model = NeuralNetwork(input_size=784, hidden_size=128, output_size=10)
# Using nn.Sequential for simpler feed-forward stacks: layers run in order.
# Sizes mirror an MNIST-style classifier: 784 -> 256 -> 128 -> 10 logits.
_layers = [
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(128, 10),
]
sequential_model = nn.Sequential(*_layers)
# Convolutional Neural Network example
class CNN(nn.Module):
    """Two-conv-layer CNN for 1x28x28 inputs (e.g. MNIST), 10 output classes."""

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)  # halves spatial dims each time
        # Two pool steps: 28 -> 14 -> 7, hence 64 * 7 * 7 flattened features.
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        # x: (B, 1, 28, 28)
        x = self.pool(F.relu(self.conv1(x)))  # (B, 32, 14, 14)
        x = self.pool(F.relu(self.conv2(x)))  # (B, 64, 7, 7)
        # flatten(x, 1) keeps the batch dimension explicit; unlike
        # view(-1, N) it cannot silently produce a wrong batch size
        # when the input shape is unexpected.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)  # raw logits
        return x


cnn_model = CNN()
print(cnn_model)
Computing Gradients
PyTorch uses automatic differentiation to compute gradients for backpropagation.
import torch

# Enable gradient tracking on leaf tensors.
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)

# Forward pass
z = x**2 + y**3
loss = z.mean()

# Backward pass populates .grad on every leaf with requires_grad=True.
loss.backward()

# Access gradients
print(f"dz/dx = {x.grad}")  # Should be 2*x = 4.0
print(f"dz/dy = {y.grad}")  # Should be 3*y^2 = 27.0

# --- Gradient accumulation ---
x = torch.tensor([1.0], requires_grad=True)

# First computation
y1 = x * 2
y1.backward()
print(f"After first backward: {x.grad}")  # 2.0

# .grad is accumulated (summed), not overwritten, across backward() calls.
y2 = x * 3
y2.backward()
print(f"After second backward: {x.grad}")  # 5.0 (2.0 + 3.0)

# Zero gradients before a fresh, independent computation.
x.grad.zero_()
y3 = x * 4
y3.backward()
print(f"After zeroing: {x.grad}")  # 4.0

# torch.no_grad() disables graph construction — more efficient for inference.
with torch.no_grad():
    z = x * 2  # z.requires_grad is False here

# detach() returns a tensor cut off from the computation graph.
x = torch.tensor([1.0], requires_grad=True)
y = x * 2
y_detached = y.detach()  # No longer tracks gradients
Creating a Training Loop with Optimizer
Putting it all together: a complete training loop with optimizer.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Define model
class SimpleNN(nn.Module):
    """Two-layer MLP: input_dim -> hidden_dim -> output_dim raw logits.

    Returns logits (no softmax): the CrossEntropyLoss used below applies
    log-softmax internally.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # x: (batch, input_dim) -> (batch, output_dim)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x
# Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleNN(input_dim=10, hidden_dim=64, output_dim=2).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # expects raw logits + integer class targets
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Optional: decay the learning rate by 10x every 10 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# Create dummy dataset: 1000 samples, 10 features, binary labels
X_train = torch.randn(1000, 10)
y_train = torch.randint(0, 2, (1000,))
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    model.train()  # Set model to training mode (enables dropout/batchnorm)
    epoch_loss = 0.0
    correct = 0
    total = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        # Move data to the same device as the model
        data, target = data.to(device), target.to(device)
        # Zero gradients — they accumulate across backward() calls by default
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, target)
        # Backward pass
        loss.backward()
        # Update weights
        optimizer.step()
        # Track metrics; .item() extracts the Python scalar without
        # keeping the computation graph alive.
        epoch_loss += loss.item()
        # argmax over the class dimension. Avoids the legacy `.data`
        # attribute, which silently bypasses autograd tracking.
        predicted = outputs.argmax(dim=1)
        total += target.size(0)
        correct += (predicted == target).sum().item()
    # Update learning rate once per epoch
    scheduler.step()
    # Print statistics
    avg_loss = epoch_loss / len(train_loader)
    accuracy = 100 * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

# Evaluation mode: disables dropout; no_grad() skips graph construction
model.eval()
with torch.no_grad():
    X_test = torch.randn(100, 10).to(device)
    predictions = model(X_test)
    print(f"Test predictions shape: {predictions.shape}")

# Save only the learned weights (state_dict), not the whole pickled module
torch.save(model.state_dict(), "model.pth")

# Load model: rebuild the architecture, then restore the weights
model = SimpleNN(input_dim=10, hidden_dim=64, output_dim=2)
# NOTE(review): on newer PyTorch, prefer torch.load("model.pth", weights_only=True)
# for checkpoints from untrusted sources — confirm the minimum torch version.
model.load_state_dict(torch.load("model.pth"))
model.to(device)
Additional Tips
# Gradient clipping (prevents exploding gradients); call it between
# loss.backward() and optimizer.step().
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# Different optimizers
optimizer_sgd = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer_adam = optim.Adam(model.parameters(), lr=0.001)
optimizer_rmsprop = optim.RMSprop(model.parameters(), lr=0.01)

# Mixed precision training (faster on modern GPUs)
# NOTE(review): torch.cuda.amp is deprecated on recent PyTorch in favor of
# torch.amp (autocast("cuda"), GradScaler("cuda")) — confirm target version.
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
for data, target in train_loader:
    optimizer.zero_grad()
    with autocast():  # run forward/loss in reduced precision where safe
        output = model(data)
        loss = criterion(output, target)
    scaler.scale(loss).backward()  # scale loss so fp16 grads don't underflow
    scaler.step(optimizer)         # unscales grads; skips step on inf/nan
    scaler.update()                # adjusts the scale factor for next step
PyTorch's flexibility and intuitive design make it an excellent choice for both research and production deep learning applications, providing powerful tools for building and training complex neural networks efficiently.