Lecture 5: Optimization Algorithms

2/4/2021

Gradient Descent
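
The cell below fits a degree-4 polynomial to noisy samples by least squares, using gradient descent with an exact line search. Summarizing what the code computes (the notation here is read off the code):

$$J(\theta) = \frac{1}{N}\|X\theta - y\|^2, \qquad \nabla J(\theta) \propto X^\top(X\theta - y),$$

$$\theta^{(t+1)} = \theta^{(t)} - \alpha_t d_t, \qquad \alpha_t = \frac{d_t^\top X^\top(X\theta^{(t)} - y)}{d_t^\top X^\top X\, d_t},$$

where $d_t$ is the search direction (here the gradient itself) and $\alpha_t$ minimizes the quadratic cost exactly along $d_t$. Scaling the direction by a constant leaves the step $\alpha_t d_t$ unchanged, so the code works with the unnormalized gradient $X^\top(X\theta - y)$.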

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=2, suppress=True)
plt.rcParams.update({'font.size': 16})

# Setup the problem
N = 50
x = 2*np.random.rand(N)-1
a = np.array([0.5, -2, -3, 4, 6])
y = a[0] + a[1]*x + a[2]*x**2 + a[3]*x**3 + \
    a[4]*x**4 + 0.25*np.random.randn(N)

# Construct X matrix
d = 5
X = np.zeros((N, d))
for j in range(d):
  X[:,j] = x**j

# Initialize gradient descent
theta = np.zeros(d)
cost  = np.zeros(1000)
XtX = np.dot( np.transpose(X), X)

# Gradient descent with exact line search
for itr in range(1000):
  dJ     = np.dot(np.transpose(X), np.dot(X, theta)-y)    # gradient of the squared error
  dd     = dJ                                             # search direction (the update subtracts alpha*dd)
  alpha  = np.dot(dJ, dd) / np.dot(np.dot(XtX, dd), dd)   # exact line-search step size
  theta  = theta - alpha*dd
  cost[itr] = np.linalg.norm(np.dot(X, theta)-y)**2/N     # mean squared error on the full data

# Plotting
fig = plt.figure(figsize = (8,5))   
plt.semilogx(cost,'o')
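
A minimal sanity-check sketch, assuming X, y, theta, and a from the cell above are still in scope: compare the gradient-descent estimate with the closed-form least-squares solution.

In [ ]:
# Compare gradient descent against the closed-form least-squares fit
theta_ls, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
print('gradient descent :', theta)
print('least squares    :', theta_ls)
print('true coefficients:', a)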

Visualizing Gradient Descent

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=2, suppress=True)
plt.rcParams.update({'font.size': 22})

# Setup the problem
N = 50
x = 2*np.random.rand(N)-1
a = np.array([0.5, 0.25])
y = a[0] + a[1]*x + 0.25*np.random.randn(N)

# Construct X matrix
d = 2
X = np.zeros((N, d))
for j in range(d):
  X[:,j] = x**j

# Setup the grid for visualization
xgrid = np.linspace(0.5-1, 0.5+1, 200)
ygrid = np.linspace(0.25-1,0.25+1,200)
theta0, theta1 = np.meshgrid(xgrid, ygrid)
cost = np.zeros((len(xgrid),len(ygrid)))
for i in range(len(xgrid)):
  for j in range(len(ygrid)):
    theta_tmp = np.hstack((theta0[i,j],theta1[i,j]))
    cost[i,j] = np.log(np.linalg.norm(np.dot(X, theta_tmp)-y)**2/N)

# Initialize gradient descent
max_itr = 1000
theta = np.array([-0.5, -0.75])
XtX   = np.dot( np.transpose(X), X)
theta_store = np.zeros((d,max_itr+1))
theta_store[0,0] = -0.5
theta_store[1,0] = -0.75

# Gradient descent
for itr in range(max_itr):
  dJ     = np.dot(np.transpose(X), np.dot(X, theta)-y)
  dd     = dJ
  alpha  = np.dot(dJ, dd) / np.dot(np.dot(XtX, dd), dd)
  theta  = theta - alpha*dd
  theta_store[:,itr+1] = theta  

# Plotting
fig = plt.figure(figsize = (6,6))    
plt.contourf(xgrid, ygrid, cost)

# Overlay the first ten gradient-descent steps on the contour plot
for i in range(10):
  plt.plot(theta_store[0,i:i+2], theta_store[1,i:i+2], c='w',linewidth=2)

Momentum Acceleration
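
As implemented in the cell below, the search direction blends the current gradient with the gradient from the previous iteration (notation read off the code):

$$d_t = \beta\, \nabla J(\theta^{(t-1)}) + (1-\beta)\, \nabla J(\theta^{(t)}), \qquad \beta = 0.9,$$

and the step size is the same exact line search as before, now taken along the blended direction $d_t$. Note that the code stores the previous raw gradient (dJ_old = dJ) rather than the previous direction, so this is a slight variant of the classical heavy-ball recursion $d_t = \beta\, d_{t-1} + \nabla J(\theta^{(t)})$.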

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=2, suppress=True)
plt.rcParams.update({'font.size': 22})

# Setup the problem
N = 50
x = 2*np.random.rand(N)-1
a = np.array([0.5, -2, -3, 4, 6])
y = a[0] + a[1]*x + a[2]*x**2 + a[3]*x**3 + \
    a[4]*x**4 + 0.1*np.random.randn(N)

# Construct X matrix
d = 5
X = np.zeros((N, d))
for j in range(d):
  X[:,j] = x**j

theta  = np.zeros(d)
cost   = np.zeros(1000)
XtX    = np.dot( np.transpose(X), X )
dJ_old = np.zeros(d)

beta  = 0.9
# Gradient descent with a momentum-style blended direction
for itr in range(1000):
  dJ     = np.dot(np.transpose(X), np.dot(X, theta)-y)    # current gradient
  dd     = beta*dJ_old + (1-beta)*dJ                       # blend previous and current gradients
  alpha  = np.dot(dJ, dd) / np.dot(np.dot(XtX, dd), dd)    # exact line-search step size
  theta  = theta - alpha*dd
  dJ_old = dJ                                              # remember the current gradient
  cost[itr] = np.linalg.norm(np.dot(X, theta)-y)**2/N      # mean squared error on the full data

# Plotting
fig = plt.figure(figsize = (8,5))   
plt.semilogx(cost,'o')
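
A comparison sketch, assuming X, y, XtX, d, and N from the cell above are still in scope (the helper run_cost and the two beta values are introduced here for illustration): rerun plain gradient descent and the momentum variant on the same data and overlay their cost curves.

In [ ]:
def run_cost(beta, n_itr=1000):
  # beta = 0 reduces to plain gradient descent with exact line search
  theta  = np.zeros(d)
  dJ_old = np.zeros(d)
  hist   = np.zeros(n_itr)
  for itr in range(n_itr):
    dJ     = np.dot(np.transpose(X), np.dot(X, theta)-y)
    dd     = dJ if beta == 0 else beta*dJ_old + (1-beta)*dJ
    alpha  = np.dot(dJ, dd) / np.dot(np.dot(XtX, dd), dd)
    theta  = theta - alpha*dd
    dJ_old = dJ
    hist[itr] = np.linalg.norm(np.dot(X, theta)-y)**2/N
  return hist

fig = plt.figure(figsize = (8,5))
plt.semilogx(run_cost(beta=0.0), 'o', label='gradient descent')
plt.semilogx(run_cost(beta=0.9), '.', label='momentum, beta = 0.9')
plt.legend()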

Visualizing Momentum Acceleration

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=2, suppress=True)
plt.rcParams.update({'font.size': 22})

# Setup the problem
N = 50
x = 2*np.random.rand(N)-1
a = np.array([0.5, 0.25])
y = a[0] + a[1]*x + 0.25*np.random.randn(N)

# Construct X matrix
d = 2
X = np.zeros((N, d))
for j in range(d):
  X[:,j] = x**j

# Setup the grid for visualization
xgrid = np.linspace(0.5-1, 0.5+1, 100)
ygrid = np.linspace(0.25-1,0.25+1,100)
theta0, theta1 = np.meshgrid(xgrid, ygrid)
cost = np.zeros((len(xgrid),len(ygrid)))
for i in range(len(xgrid)):
  for j in range(len(ygrid)):
    theta_tmp = np.hstack((theta0[i,j],theta1[i,j]))
    cost[i,j] = np.log(np.linalg.norm(np.dot(X, theta_tmp)-y)**2/N)

# Initialize momentum
max_itr = 1000
theta = np.array([-0.5, -0.75])
XtX   = np.dot( np.transpose(X), X)
dJ_old = np.zeros(d)
theta_store = np.zeros((d,max_itr+1))
theta_store[0,0] = -0.5
theta_store[1,0] = -0.75

beta = 0.9
# Gradient descent with momentum
for itr in range(max_itr):
  dJ     = np.dot(np.transpose(X), np.dot(X, theta)-y)    # current gradient
  dd     = beta*dJ_old + dJ                                # add a fraction of the previous gradient
  alpha  = np.dot(dJ, dd) / np.dot(np.dot(XtX, dd), dd)    # exact line-search step size
  theta  = theta - alpha*dd
  dJ_old = dJ
  theta_store[:,itr+1] = theta  

# Plotting
fig = plt.figure(figsize = (8,5))    
plt.contourf(xgrid, ygrid, cost)
plt.axis('square')

for i in range(10):
  plt.plot(theta_store[0,i:i+2], theta_store[1,i:i+2], c='w',linewidth=2)

Stochastic Gradient Descent
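
The next cell replaces the full gradient with a mini-batch estimate. At each iteration a random index set $\mathcal{B}$ of size $B$ is drawn with replacement (notation inferred from the code), and both the direction and the exact line-search step are computed on the mini-batch:

$$d_t = X_{\mathcal{B}}^\top (X_{\mathcal{B}}\theta^{(t)} - y_{\mathcal{B}}), \qquad \alpha_t = \frac{d_t^\top d_t}{d_t^\top X_{\mathcal{B}}^\top X_{\mathcal{B}}\, d_t}.$$

The cost recorded for the plot is still evaluated on the full data set.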

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=2, suppress=True)
plt.rcParams.update({'font.size': 16})

# Setup the problem
N = 5000
x = 2*np.random.rand(N)-1
a = np.array([0.5, -2, -3, 4, 6])
y = a[0] + a[1]*x + a[2]*x**2 + a[3]*x**3 + \
    a[4]*x**4 + 0.25*np.random.randn(N)

# Construct X matrix
d = 5
X = np.zeros((N, d))
for j in range(d):
  X[:,j] = x**j

# Initialize stochastic gradient descent
theta = np.zeros(d)
cost  = np.zeros(1000)

# Stochastic gradient descent with mini-batches of size B
B = 50
for itr in range(1000):
  idx    = np.random.randint(N, size=B)                    # sample a mini-batch (with replacement)
  Xsub   = X[idx,:]
  ysub   = y[idx]
  XtX    = np.dot( np.transpose(Xsub), Xsub)
  dJ     = np.dot(np.transpose(Xsub), np.dot(Xsub, theta)-ysub)   # mini-batch gradient
  dd     = dJ
  alpha  = np.dot(dJ, dd) / np.dot(np.dot(XtX, dd), dd)    # exact line-search step on the mini-batch
  theta  = theta - alpha*dd
  cost[itr] = np.linalg.norm(np.dot(X, theta)-y)**2/N

# Plotting
fig = plt.figure(figsize = (8,5))   
plt.semilogx(cost,'o')
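
A small batch-size experiment sketch, assuming X, y, N, and d from the cell above are still in scope (the helper run_sgd and the batch sizes are chosen here for illustration): rerun the stochastic loop for several mini-batch sizes and compare the full-data cost curves.

In [ ]:
def run_sgd(B, n_itr=1000):
  theta = np.zeros(d)
  hist  = np.zeros(n_itr)
  for itr in range(n_itr):
    idx   = np.random.randint(N, size=B)                   # sample a mini-batch (with replacement)
    Xsub  = X[idx,:]
    ysub  = y[idx]
    XtX   = np.dot(np.transpose(Xsub), Xsub)
    dJ    = np.dot(np.transpose(Xsub), np.dot(Xsub, theta)-ysub)
    alpha = np.dot(dJ, dJ) / np.dot(np.dot(XtX, dJ), dJ)
    theta = theta - alpha*dJ
    hist[itr] = np.linalg.norm(np.dot(X, theta)-y)**2/N    # cost on the full data
  return hist

fig = plt.figure(figsize = (8,5))
for B in [10, 100, 1000]:
  plt.semilogx(run_sgd(B), '.', label='B = %d' % B)
plt.legend()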

Visualizing Stochastic Gradient Descent

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=2, suppress=True)
plt.rcParams.update({'font.size': 22})

# Setup the problem
N = 50
x = 2*np.random.rand(N)-1
a = np.array([0.5, 0.25])
y = a[0] + a[1]*x + 0.25*np.random.randn(N)

# Construct X matrix
d = 2
X = np.zeros((N, d))
for j in range(d):
  X[:,j] = x**j

# Setup the grid for visualization
xgrid = np.linspace(0.5-1, 0.5+1, 100)
ygrid = np.linspace(0.25-1,0.25+1,100)
theta0, theta1 = np.meshgrid(xgrid, ygrid)
cost = np.zeros((len(xgrid),len(ygrid)))
for i in range(len(xgrid)):
  for j in range(len(ygrid)):
    theta_tmp = np.hstack((theta0[i,j],theta1[i,j]))
    cost[i,j] = np.log(np.linalg.norm(np.dot(X, theta_tmp)-y)**2/N)

# Initialize stochastic gradient descent
max_itr = 1000
theta = np.array([-0.5, -0.75])
theta_store = np.zeros((d,max_itr+1))
theta_store[:,0] = theta
B = 50   # mini-batch size (equal to N here; indices are drawn with replacement)

# Stochastic gradient descent
for itr in range(max_itr):
  idx    = np.random.randint(N, size=B)                    # sample a mini-batch (with replacement)
  Xsub   = X[idx,:]
  ysub   = y[idx]
  XtX    = np.dot( np.transpose(Xsub), Xsub)
  dJ     = np.dot(np.transpose(Xsub), np.dot(Xsub, theta)-ysub)   # mini-batch gradient
  dd     = dJ
  alpha  = np.dot(dJ, dd) / np.dot(np.dot(XtX, dd), dd)    # exact line-search step on the mini-batch
  theta  = theta - alpha*dd
  theta_store[:,itr+1] = theta  

# Plotting
fig = plt.figure(figsize = (8,5))    
plt.contourf(xgrid, ygrid, cost)
plt.axis('square')

for i in range(10):
  plt.plot(theta_store[0,i:i+2], theta_store[1,i:i+2], c='w',linewidth=2)
In [ ]:
%%shell
jupyter nbconvert --to html /content/ECE595_lecture05_gradient.ipynb