# Example of fitting a Hinge Loss, L2-regularized SVM

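We use PyTorch to fit an SVM by "gradient" descent (strictly speaking, subgradient descent, because the hinge loss is not differentiable where the margin equals 1) to separate t-shirts from pants in the FashionMNIST data set.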

Recall the SVM objective
$$
J(\omega, b) = \frac{1}{n} \sum_{i=1}^{n} \left(1 - y_i (\langle \omega, x_i \rangle + b)\right)_+ + \lambda \|\omega\|_2^2
$$

### Note:
Fixed bugs relating to running with cuda instead of cpu

In [None]:
## Import all the classes we will need

# PyTorch functions for manipulating tensors with autograd, working on hardware accelerators (GPUs, etc)
import torch

# PyTorch functions for dealing with data sets
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, Subset

# TorchVision functions for dealing with vision data sets
from torchvision import datasets
import torchvision.transforms as T

# Matplotlib for visualization
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid

# Pick the compute device: a CUDA GPU when PyTorch can see one, the CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

In [None]:
## Download (when not already on disk) and load the train and test splits, transforming on load

# ToTensor turns each image into a tensor of shape [1, 28, 28]; torch.flatten then
# collapses it into a vector of shape [784]
flatten_image = T.Compose([T.ToTensor(), T.Lambda(torch.flatten)])

training_data = datasets.FashionMNIST(root="data", train=True, download=True, transform=flatten_image)

testing_data = datasets.FashionMNIST(root="data", train=False, download=True, transform=flatten_image)

# We are doing binary classification, so keep only the samples labelled 0 or 1

# Subset is a PyTorch convenience wrapper that exposes just the selected indices of a data set
is_binary_train = training_data.targets < 2
train_idx = torch.where(is_binary_train)[0]
train_subset = Subset(training_data, train_idx)

is_binary_test = testing_data.targets < 2
test_idx = torch.where(is_binary_test)[0]
test_subset = Subset(testing_data, test_idx)

In [None]:
## Convert these PyTorch data sets into X, y matrix/vector pairs

# DataLoaders are very convenient Python iterators for when we need to deal with shuffled
# minibatches over multiple epochs, and our data can't all fit in memory at once.
# Here we do not really use those features: a single batch spanning the whole subset
# is requested purely to materialise all the samples as one pair of tensors.
train_dataloader = DataLoader(train_subset, batch_size=len(train_subset), shuffle=False)
test_dataloader = DataLoader(test_subset, batch_size=len(test_subset), shuffle=False)

Xtrain, ytrain = next(iter(train_dataloader))
Xtrain = Xtrain.to(device)  # send the data to the device we're using
# Map the {0, 1} class labels to the {-1, +1} labels the hinge loss expects
# (0 -> -1, 1 -> +1) with vectorised arithmetic, and reshape to a column vector
# so it lines up with the (n, 1) margins computed later.
ytrain = (2 * ytrain - 1).view(-1, 1).to(device)

Xtest, ytest = next(iter(test_dataloader))
Xtest = Xtest.to(device)
ytest = (2 * ytest - 1).view(-1, 1).to(device)

In [None]:
# Show a few raw training images from each of the two classes being separated

nexamples = 7

# Positions of the first `nexamples` training images of class 0 and of class 1
first_class0 = torch.where(training_data.targets == 0)[0][:nexamples]
first_class1 = torch.where(training_data.targets == 1)[0][:nexamples]

exemplars0 = [training_data.data[idx] for idx in first_class0]
exemplars1 = [training_data.data[idx] for idx in first_class1]

# A 2 x nexamples grid of axes; the class-0 images are drawn first, then the class-1 images
fig = plt.figure(figsize=(2 * nexamples, 4))
grid = ImageGrid(fig, 111, nrows_ncols=(2, nexamples), axes_pad=0.1)

# Iterating over the grid yields its Axes one at a time
for ax, im in zip(grid, [*exemplars0, *exemplars1]):
    ax.imshow(im)

plt.show()

In [None]:
## Use "gradient" descent to fit our SVM model

# Choose hyperparameters
ll = 0  # regularization parameter (the lambda multiplying the squared L2 norm of the weights)
alpha = .1  # stepsize, aka learning rate

# Initialize model parameters and indicate gradients need to accumulate for them
bias = torch.zeros((1,), device=device, requires_grad=True)
weights = torch.zeros((784, 1), device=device, requires_grad=True)

# Do several rounds of optimization
numepochs = 100
losshistory = []
accuracyhistory = []

# The zero the hinge is clipped against; built once on the target device instead of
# being re-created and transferred on every iteration
zero = torch.zeros(1, device=device)

# NOTE: range(1, numepochs) performs numepochs - 1 updates, so each history list
# ends up with numepochs - 1 entries.
for curepoch in range(1, numepochs):
    # reset the gradients to None before each gradient computation, to prevent accumulation
    weights.grad = None
    bias.grad = None

    # Calculate the objective: mean hinge loss plus the L2 penalty (a scalar tensor)
    predmargin = Xtrain @ weights + bias
    hinge = torch.mean(torch.max(zero, 1 - ytrain * predmargin))
    loss = hinge + ll * torch.sum(weights ** 2)

    # Use autograd to compute the gradients with respect to the model parameters
    loss.backward()

    # Store the history of the losses and the accuracies. Both are measured with the
    # parameters as they were *before* this iteration's update. A sample counts as
    # correct when its margin has the same sign as its label; detach() keeps this
    # bookkeeping out of the autograd graph.
    losshistory.append(loss.item())
    correct = torch.eq(torch.sign(predmargin.detach() * ytrain), torch.ones_like(ytrain))
    accuracyhistory.append(torch.mean(correct.float()).item())

    # Update the model parameters to move along the negative of their gradients.
    # The in-place step is done under no_grad so autograd does not record it.
    with torch.no_grad():
        weights -= alpha * weights.grad
        bias -= alpha * bias.grad

In [None]:
# Visualize the training losses and accuracies

# One x value per recorded history entry, numbered from 1. Deriving the axis from the
# history itself keeps the plot valid however many updates the training loop performed.
epochs = range(1, len(losshistory) + 1)

plt.figure(figsize=(15, 5))

# Left panel: objective value per epoch
plt.subplot(121)
plt.plot(epochs, losshistory, 'r*-')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss")

# Right panel: fraction of training samples classified correctly per epoch
plt.subplot(122)
plt.plot(epochs, accuracyhistory, 'g+-')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Training Accuracy")

plt.show()

In [None]:
## Display the final accuracies to see how linearly separable the two classes are

# Evaluate both splits with the *final* parameters. The last entry of accuracyhistory
# was recorded before the last parameter update, so it is recomputed here to make the
# training and testing numbers comparable. No gradients are needed for evaluation.
with torch.no_grad():
    # Compute the training accuracy
    trainpred = Xtrain @ weights + bias
    trainingaccuracy = torch.mean(
        torch.eq(torch.sign(trainpred * ytrain), torch.ones_like(ytrain)).float()
    ).item()

    # Compute the testing accuracy
    testpred = Xtest @ weights + bias
    testingaccuracy = torch.mean(
        torch.eq(torch.sign(testpred * ytest), torch.ones_like(ytest)).float()
    ).item()

# .item() converts the 0-dim tensors to Python floats so the report shows plain numbers
# rather than tensor reprs
print(f"Final training accuracy: {trainingaccuracy}")
print(f"Testing accuracy: {testingaccuracy}")
print(f"Generalization gap: {trainingaccuracy - testingaccuracy}")