Chain rule:
A PyTorch tensor can optionally store a gradient in addition to its value.
PyTorch uses autograd to compute these gradients automatically:
# Chain rule by hand: z = y**2 with y = 2x, so z = 4x**2 and dz/dx = 8x.
# At x = 1 the gradient is therefore 8 (not 4).
x = torch.tensor(1.0, requires_grad=True)
y = x * 2
z = y**2
z.backward()  # automatically computes gradients
print(x.grad)  # tensor(8.)
Gradient is always w.r.t.
Each tensor has both a value and a gradient
Forward pass: fill in values
Backward pass: fill in gradients

Let's say
Calculate
Suppose
Calculate
Objectives:
loss.backward().
optimizer.step() to update parameters.
outputs = model(inputs)
# Compare the model outputs with the targets to get the loss for this batch.
loss = criterion(outputs, targets)
# Reset the optimizer's gradients before the backward pass writes new ones.
optimizer.zero_grad()
# Backward pass: fill in gradients.
loss.backward()
# Apply the parameter update.
optimizer.step()
# .item() extracts the loss as a plain Python number for printing.
print(loss.item())
# Full training loop. The loop bodies must be nested as below: the original
# text had lost all indentation and was not valid Python.
model = VGG16().cuda()  # Move model to GPU
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    for batch_idx, (images, labels) in enumerate(train_loader):
        # Move data to GPU
        images = images.cuda()  # shape: [batch_size, 3, 224, 224]
        labels = labels.cuda()  # shape: [batch_size]

        # Forward pass
        outputs = model(images)  # shape: [batch_size, 1000]
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the loss of every 100th batch
        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Loss: {loss.item():.4f}')
# HuggingFace Trainer configuration: evaluate, log and checkpoint every 100 steps.
training_args = TrainingArguments(
    output_dir='./output',
    logging_dir='./logs',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # num_train_epochs=200,
    max_steps=20000,
    warmup_steps=400,
    weight_decay=0.01,
    logging_steps=100,
    bf16=True,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    # NOTE(review): assumes the metrics function reports a key named "F1" — confirm.
    metric_for_best_model="eval_F1",
    # F1 is a higher-is-better score, so the best checkpoint is the one with
    # the largest value (the original False would select the worst one).
    greater_is_better=True,
)
# Further Trainer arguments (datasets, compute_metrics, ...) are elided here.
trainer = Trainer(model=model, args=training_args)
trainer.train()

import wandb
# Start a Weights & Biases run; plain string literal (no placeholders, so no f-prefix).
wandb.init(project="project", name='curve')
... Training Code ...
| Predicted\Actual | Class 1 | Class 2 | Class 3 | Class 4 | Class 5 |
|---|---|---|---|---|---|
| Class 1 | 85 | 7 | 3 | 4 | 1 |
| Class 2 | 5 | 90 | 6 | 2 | 2 |
| Class 3 | 2 | 4 | 88 | 5 | 1 |
| Class 4 | 3 | 2 | 4 | 82 | 4 |
| Class 5 | 1 | 3 | 2 | 3 | 91 |
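sklearn.metrics.confusion_matrix(y_true, y_pred, ...)  # note: sklearn puts actual labels on rows and predicted labels on columns, i.e. the transpose of the Predicted\Actual tables shown here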
| Predicted\Actual | False | True |
|---|---|---|
| False | TN | FN |
| True | FP | TP |

Objectives:
class torch.optim.SGD(params, lr=0.001, momentum=0, weight_decay=0, ...)
# Build the "linear" scheduler for `optimizer`, passing 400 warm-up steps
# and 20000 total training steps.
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=400,
num_training_steps=20000,
)
# PyTorch
# Manual loop with SGD + momentum. The original text had lost the loop
# indentation; the per-batch steps belong inside the inner loop.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()  # reset gradients before this batch
        outputs = model(inputs)  # forward pass
        loss = criterion(outputs, targets)
        loss.backward()  # backward pass
        optimizer.step()  # parameter update
# HuggingFace
# Hand a custom optimizer and LR scheduler to the Trainer.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The scheduler advances once per optimizer step, so the total must count
# batches, not samples (the original multiplied len(train_dataset) by epochs).
# NOTE(review): assumes a single device and no gradient accumulation — confirm.
steps_per_epoch = -(-len(train_dataset) // training_args.per_device_train_batch_size)  # ceiling division
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=400,
    # num_train_epochs may be a float, so cast the product to int.
    num_training_steps=int(steps_per_epoch * training_args.num_train_epochs),
)
# Further Trainer arguments (datasets, ...) are elided here.
trainer = Trainer(model=model, args=training_args, optimizers=(optimizer, lr_scheduler))
trainer.train()
Different parameters may require different step sizes.
Adaptive optimizers adjust the effective LR for each parameter based on historical gradients.
Often converge faster or require less manual tuning than plain SGD.

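Maintains a moving average of squared gradients: $v_t = \beta v_{t-1} + (1 - \beta)\, g_t^2$
Update: $\theta_t = \theta_{t-1} - \dfrac{\eta}{\sqrt{v_t} + \epsilon}\, g_t$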
Good for non-stationary environments
Widely used in RNNs
# Adam over all of the model's parameters with a learning rate of 0.001.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
Adam: weight decay is coupled with the gradient
AdamW: weight decay decoupled and directly applied to the weights
Adagrad: Accumulates historical gradients, good for sparse data but LR decreases over time.
Adadelta: Tries to fix Adagrad’s diminishing LR issue.
Generally, AdamW and Adam are more popular due to good defaults.
Objectives:
| Sacrifice \ Gain | Space | Time | Accuracy |
|---|---|---|---|
| Space | - | | |
| Time | | - | |
| Accuracy | | | - |
Generalized CAP Theorem: out of three, you can only have two.
(The original CAP theorem is about Consistency, Availability, and Partition tolerance of distributed systems.)
from torch.utils.checkpoint import checkpoint
class SimpleModel(nn.Module):
    """Three stacked 1024->1024 linear layers; the first two are checkpointed.

    Checkpointing discards the intermediate activations of the wrapped layer
    during the forward pass and recomputes them in the backward pass, trading
    extra compute for lower memory use. `checkpoint` here is
    `torch.utils.checkpoint.checkpoint`.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(1024, 1024)
        self.layer2 = nn.Linear(1024, 1024)
        self.layer3 = nn.Linear(1024, 1024)

    def forward(self, x):
        """Apply the three layers in order and return the final output."""
        # Use checkpointing on layers to save memory.
        # use_reentrant=False is passed explicitly: the reentrant variant
        # records no graph when the input does not require grad (the usual
        # case for raw data), which would leave layer1/layer2 without
        # gradients.
        x = checkpoint(self.layer1, x, use_reentrant=False)  # originally: x = self.layer1(x)
        x = checkpoint(self.layer2, x, use_reentrant=False)  # originally: x = self.layer2(x)
        x = self.layer3(x)  # No checkpointing for layer3
        return x
class CheckpointFunction(torch.autograd.Function):
    """Minimal activation checkpointing as a custom autograd function.

    forward runs `run_function` without recording a graph and keeps only its
    tensor inputs; backward re-runs it with gradients enabled and
    backpropagates through the recomputed graph.

    Limitations of this teaching version: every positional argument must be a
    tensor (they go through `save_for_backward`), at least one of them must
    require grad for backward to be invoked at all, and random state (e.g.
    dropout) is not restored for the recomputation.
    """

    @staticmethod
    def forward(ctx, run_function, *args):
        """Run `run_function(*args)` under no_grad, remembering only the inputs."""
        ctx.run_function = run_function
        ctx.save_for_backward(*args)  # save *args for backward
        with torch.no_grad():
            outputs = run_function(*args)
        return outputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        """Recompute the forward pass and return gradients for the tensor inputs.

        Returns a tuple whose first entry is None (for `run_function`) followed
        by one gradient per saved input.
        """
        # Fresh leaf copies so the recomputed graph ends at these tensors.
        inputs = [x.detach().requires_grad_(True) for x in ctx.saved_tensors]
        with torch.enable_grad():
            outputs = ctx.run_function(*inputs)
        if isinstance(outputs, torch.Tensor):
            outputs = (outputs,)
        # Only outputs attached to the recomputed graph can be backpropagated.
        pairs = [(out, grad) for out, grad in zip(outputs, grad_outputs)
                 if out.requires_grad]
        if pairs:
            # autograd.backward (rather than autograd.grad w.r.t. the inputs)
            # also accumulates into the .grad of run_function's own
            # parameters; otherwise a checkpointed layer would never train.
            torch.autograd.backward([out for out, _ in pairs],
                                    [grad for _, grad in pairs])
        return (None, *(x.grad for x in inputs))
def checkpoint(run_function, *args):
    """Call `run_function(*args)` through CheckpointFunction.

    Returns whatever `CheckpointFunction.apply` returns for these arguments.
    """
    return CheckpointFunction.apply(run_function, *args)

Assuming
Computation cost of:
Computation cost of:
Computation cost of:
What's the cost ratio of LoRA to the original?
class LoRAWrapper(nn.Module):
    """Wrap a linear layer with a trainable low-rank (LoRA) update.

    The output is ``base_layer(x) + scaling * lora_B(lora_A(x))``.

    Args:
        base_layer: the linear layer to adapt; its ``in_features``,
            ``out_features`` and ``weight`` attributes are read. Its
            parameters are frozen in place so only the adapter trains.
        r: rank of the low-rank update.
        alpha: optional scaling numerator. ``None`` (default) leaves the
            update unscaled; otherwise the update is multiplied by
            ``alpha / r``.
    """

    def __init__(self, base_layer, r=32, alpha=None):
        super().__init__()
        self.base_layer = base_layer
        self.r = r  # Rank of the low-rank approximation
        self.scaling = 1.0 if alpha is None else alpha / r

        # Freeze the wrapped layer: LoRA trains only the low-rank factors.
        for param in base_layer.parameters():
            param.requires_grad_(False)

        # Create low-rank layers on the same device/dtype as the base weight
        # so the sum in forward() does not mix devices or dtypes.
        factory = {"device": base_layer.weight.device,
                   "dtype": base_layer.weight.dtype}
        self.lora_A = nn.Linear(base_layer.in_features, r, bias=False, **factory)
        self.lora_B = nn.Linear(r, base_layer.out_features, bias=False, **factory)
        nn.init.kaiming_uniform_(self.lora_A.weight)
        # B starts at zero so the wrapped layer initially reproduces the base
        # layer exactly; random B would perturb the pretrained output.
        nn.init.zeros_(self.lora_B.weight)

    def forward(self, x):
        """Return the base output plus the scaled low-rank correction."""
        return self.base_layer(x) + self.scaling * self.lora_B(self.lora_A(x))
# Define a basic model
class BasicModel(nn.Module):
    """A single linear layer mapping `input_dim` features to `output_dim`."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Apply the linear layer to `x`."""
        return self.linear(x)
# Instantiate the model
basic_model = BasicModel(4096, 4096)
# Swap the linear layer for its LoRA-wrapped version. Only the arguments
# LoRAWrapper's constructor declares (base_layer, r) are passed; the extra
# alpha=1 in the original raised a TypeError.
basic_model.linear = LoRAWrapper(basic_model.linear, r=32)
# Forward pass with the modified model
# NOTE(review): `x` is not defined in this snippet; presumably an input batch
# whose last dimension is 4096 — confirm.
output_with_lora = basic_model(x)
import torch.nn as nn
def patch_model_with_lora(model, r=32):
    """Replace every `nn.Linear` inside `model` with a `LoRAWrapper`, in place.

    Walks the module tree recursively: direct `nn.Linear` children are
    wrapped, and every other child is searched for nested linear layers.

    Args:
        model: the module to patch; it is modified in place.
        r: rank passed to each `LoRAWrapper`.

    Returns:
        The same `model` object, for convenient chaining.

    Note:
        Calling this twice on the same model wraps the linear layers found
        inside the existing wrappers again.
    """
    # Snapshot the children first: setattr below replaces entries of the
    # mapping being walked.
    for name, module in list(model.named_children()):
        if isinstance(module, nn.Linear):
            setattr(model, name, LoRAWrapper(module, r=r))
        else:
            # Recursively apply to child modules (the original passed an
            # undefined `alpha` here and raised NameError).
            patch_model_with_lora(module, r)
    return model
# Build the model, then wrap each of its nn.Linear layers with rank-32 LoRA.
model = SimpleModel()
# patch_model_with_lora returns the model object it was given.
patched_model = patch_model_with_lora(model, r=32)
(We should be able to apply this to Llama 3.x)
Applying LoRA to Self-Attention






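In addition to parameters and gradients, training must also maintain optimizer states (e.g., Adam's moment estimates).
Stage 1: partitions optimizer states
Stage 2: additionally partitions gradients
Stage 3: additionally partitions model parameters
Time cost increases from stage to stage due to more communication.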

from accelerate import Accelerator
# Accelerator handle; used below to unwrap the model and to save it.
accelerator = Accelerator()
# Trainer configuration; `deepspeed` names the DeepSpeed JSON config file.
training_args = TrainingArguments(
...
bf16=True,
deepspeed = "deepspeed.json",
...
)
trainer = Trainer(model=model, args=training_args)
trainer.train()
# Save through the accelerator.
# NOTE(review): `output_dir` is not defined in this snippet — confirm where it comes from.
# NOTE(review): under ZeRO stage 3 the weights are sharded across processes; Accelerate's
# DeepSpeed guide also passes state_dict=accelerator.get_state_dict(model) and enables
# stage3_gather_16bit_weights_on_model_save — confirm the saved checkpoint is complete.
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir,
is_main_process=accelerator.is_main_process,
save_function=accelerator.save)
deepspeed.json
{
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "none"
    },
    "allgather_partitions": true,
    "reduce_scatter": true,
    "stage3_gather_16bit_weights_on_model_save": false
  },
  "train_batch_size": 40,
  "gradient_accumulation_steps": 1,
  "train_micro_batch_size_per_gpu": 8,
  "bf16": {
    "enabled": true
  }
}
# deepspeed.yaml
# Accelerate launch configuration: one machine, five processes, DeepSpeed backend.
compute_environment: LOCAL_MACHINE
debug: false
# The two keys below are nested options of deepspeed_config.
deepspeed_config:
  deepspeed_config_file: deepspeed.json
  zero3_init_flag: false
distributed_type: DEEPSPEED
downcast_bf16: 'no'
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 5
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
# Expose GPUs 0-7 to the job, then launch train.py through Accelerate using deepspeed.yaml.
# NOTE(review): eight devices are exposed here while deepspeed.yaml sets num_processes: 5 — confirm this is intended.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
accelerate launch --config_file=deepspeed.yaml ./train.py ......
**Goal:** Provide a strong foundation in how deep learning models are built, how they learn from data via gradients and backpropagation, how to choose and measure progress with loss functions and metrics, how to prepare data efficiently, and how regularization and initialization techniques influence training.
Emphasize tensors for efficient parallel processing.
A stack of linear layers with nonlinear activations.