import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.random.seed(123)
n = 10000
p = 5
theta_true = np.array([-2, -1, 0, 1, 2])
We consider the linear model
$$ Y_i = X_i^T \theta + \varepsilon_i, $$
with, as in the simulation code below, $X_i \sim \mathcal{N}(0, I_p)$ and $\varepsilon_i \sim \mathcal{N}(0, 1)$ independent, and $\theta$ equal to theta_true above.
We take $n = 10000$.
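The two recursions implemented below can be summarized as follows (this summary is read off the code, it is not stated in the original text): with step sequence $\gamma_i = c\, i^{-\alpha}$,
$$ \theta_{i+1} = \theta_i + \gamma_{i+1}\,\bigl(Y_{i+1} - X_{i+1}^T \theta_i\bigr)\, X_{i+1}, \qquad \bar\theta_{i+1} = \bar\theta_i + \frac{1}{i+1}\bigl(\theta_{i+1} - \bar\theta_i\bigr), $$
and the stochastic Newton variant replaces the scalar step by the matrix gain $\frac{i+1}{i+100}\, H_i$, where $H_i$ is updated recursively below.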
X = np.random.randn(n, p)
Y = X @ theta_true + np.random.randn(n)
def newton_linear(X, Y):
    """Stochastic Newton recursion for the linear model."""
    n, p = X.shape
    theta = np.zeros((n + 1, p))
    H = np.eye(p)  # initial inverse-Hessian estimate H_0^{-1}
    for i in range(n):
        # Stochastic gradient of the squared loss at observation i
        grad = -(Y[i] - X[i] @ theta[i]) * X[i]
        # Newton-type update of theta with a damped step
        step = (i + 1) / (i + 100)
        theta[i + 1] = theta[i] - step * H @ grad
        # Recursive update of H^{-1} (Sherman-Morrison formula)
        u = H @ X[i]
        denom = 1 + X[i] @ u
        H = H - np.outer(u, u) / denom
    return theta
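The rank-one update above is the Sherman-Morrison formula: for invertible $A$ with $H = A^{-1}$,
$$ (A + x x^T)^{-1} = H - \frac{H x\, x^T H}{1 + x^T H x}, $$
so after $i$ iterations $H = \bigl(I_p + \sum_{k=1}^{i} X_k X_k^T\bigr)^{-1}$, a regularized recursive estimate of the (scaled) inverse Hessian $\mathbb{E}[X X^T]^{-1}$.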
def asgd_linear(X, Y, c=1, alpha=0.75):
    """SGD with Polyak-Ruppert averaging for the linear model."""
    n, p = X.shape
    theta = np.zeros(p)
    thetabar = np.zeros(p)
    theta_path = np.zeros((n + 1, p))
    thetabar_path = np.zeros((n + 1, p))
    for i in range(n):
        # Stochastic gradient of the squared loss at observation i
        grad = (X[i] @ theta - Y[i]) * X[i]
        # SGD step with learning rate c * (i+1)^(-alpha)
        theta = theta - c * (i + 1) ** (-alpha) * grad
        # Running average of the iterates (Polyak-Ruppert)
        thetabar = thetabar + (theta - thetabar) / (i + 1)
        theta_path[i + 1] = theta
        thetabar_path[i + 1] = thetabar
    return theta_path, thetabar_path
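The running-average recursion computes $\bar\theta_i = \frac{1}{i}\sum_{k=1}^{i}\theta_k$ without storing the whole trajectory. A minimal sanity check of that identity (not part of the original script; it reuses X and Y from above):
# Check: the recursive average equals the cumulative mean of the SGD iterates
tp, tbp = asgd_linear(X, Y)
k = 100  # any iteration index >= 1
assert np.allclose(tbp[k], tp[1:k + 1].mean(axis=0))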
theta_new = newton_linear(X, Y)
theta_path, thetabar_path = asgd_linear(X, Y)
errnew = np.sum((theta_new - theta_true)**2, axis=1)
err = np.sum((theta_path - theta_true)**2, axis=1)
errbar = np.sum((thetabar_path - theta_true)**2, axis=1)
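As a reference point (an optional check, not in the original script), the full-sample OLS estimator gives the error the recursive methods should approach at $n$:
# OLS on the full sample as a benchmark for the final recursive estimates
theta_ols = np.linalg.lstsq(X, Y, rcond=None)[0]
print("OLS error:   ", np.sum((theta_ols - theta_true)**2))
print("Newton error:", errnew[-1])
print("ASGD error:  ", errbar[-1])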
plt.figure()
plt.plot(err, label="SGD")
plt.plot(errbar, label="ASGD")
plt.plot(errnew, label="Newton")
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Iteration")
plt.ylabel("Squared error")
plt.legend()
plt.show()
B = 50
n, p = X.shape
# Error matrices: (n+1) x B replications
err_mat = np.zeros((n + 1, B))
err_matbar = np.zeros((n + 1, B))
err_new = np.zeros((n + 1, B))
theta_true_mat = np.tile(theta_true, (n + 1, 1))
for b in range(B):
    Xb = np.random.randn(n, p)
    Yb = Xb @ theta_true + np.random.randn(n)
    th, thbar = asgd_linear(Xb, Yb)
    thnew = newton_linear(Xb, Yb)
    err_mat[:, b] = np.sum((th - theta_true_mat)**2, axis=1)
    err_new[:, b] = np.sum((thnew - theta_true_mat)**2, axis=1)
    err_matbar[:, b] = np.sum((thbar - theta_true_mat)**2, axis=1)
# Average over the Monte Carlo replications
err_mean = err_mat.mean(axis=1)
err_meannew = err_new.mean(axis=1)
err_meanbar = err_matbar.mean(axis=1)
i = np.arange(1, n + 2)
# Plot
plt.figure()
plt.plot(i, err_mean, label="SGD", linewidth=1)
plt.plot(i, err_meanbar, label="ASGD", linewidth=1)
plt.plot(i, err_meannew, label="Newton", linewidth=1)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("n")
plt.ylabel("Mean squared error")
plt.grid(True, which="both", linestyle="--", alpha=0.1)
plt.legend()
plt.show()
We now take $X \sim \mathcal{N}(0, D)$ with $D = \mathrm{diag}\left( 10^{-2}, 10^{-1}, 1, 10, 10^{2} \right)$. Examine how the mean squared errors of the stochastic gradient estimators and of their averaged versions evolve for different values of $c_{\gamma}$ (the run below uses a single value; a sketch of a sweep over several values follows the plot).
# Square root of the covariance matrix, so that X = Z @ sqrtD has covariance D
sqrtD = np.sqrt(np.diag([1e-2, 1e-1, 1, 10, 1e2]))
X = np.random.randn(n, p) @ sqrtD
Y = X @ theta_true + np.random.randn(n)
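A quick empirical check (not in the original) that the design has the intended covariance:
# The empirical covariance of X should be close to diag(1e-2, 1e-1, 1, 10, 1e2)
print(np.round(np.diag(X.T @ X / n), 3))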
B = 50
n, p = X.shape
# Error matrices: (n+1) x B replications
err_matnew = np.zeros((n + 1, B))
err_mat = np.zeros((n + 1, B))
err_matbar = np.zeros((n + 1, B))
theta_true_mat = np.tile(theta_true, (n + 1, 1))
for b in range(B):
    Xb = np.random.randn(n, p) @ sqrtD
    Yb = Xb @ theta_true + np.random.randn(n)
    th, thbar = asgd_linear(Xb, Yb, c=0.01)
    thnew = newton_linear(Xb, Yb)
    err_mat[:, b] = np.sum((th - theta_true_mat)**2, axis=1)
    err_matnew[:, b] = np.sum((thnew - theta_true_mat)**2, axis=1)
    err_matbar[:, b] = np.sum((thbar - theta_true_mat)**2, axis=1)
# Average over the Monte Carlo replications
err_mean = err_mat.mean(axis=1)
err_meanbar = err_matbar.mean(axis=1)
err_meannew = err_matnew.mean(axis=1)
i = np.arange(1, n + 2)
# Plot
plt.figure()
plt.plot(i, err_mean, label="SGD", linewidth=1)
plt.plot(i, err_meanbar, label="ASGD", linewidth=1)
plt.plot(i, err_meannew, label="Newton", linewidth=1)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("n")
plt.ylabel("Mean squared error")
plt.grid(True, which="both", linestyle="--", alpha=0.1)
plt.legend()
plt.show()
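The question asks about several values of $c_\gamma$, while the experiment above fixes c=0.01. A minimal sketch of such a sweep (the grid of values below is an arbitrary illustrative choice, not prescribed by the text):
# Sweep over the step-size constant c (c_gamma), averaging errors over B replications
plt.figure()
for c in [1e-3, 1e-2, 1e-1, 1]:
    err_c = np.zeros(n + 1)
    errbar_c = np.zeros(n + 1)
    for b in range(B):
        Xb = np.random.randn(n, p) @ sqrtD
        Yb = Xb @ theta_true + np.random.randn(n)
        th, thbar = asgd_linear(Xb, Yb, c=c)
        err_c += np.sum((th - theta_true_mat)**2, axis=1) / B
        errbar_c += np.sum((thbar - theta_true_mat)**2, axis=1) / B
    plt.plot(i, err_c, linewidth=1, label=f"SGD, c={c}")
    plt.plot(i, errbar_c, linewidth=1, linestyle="--", label=f"ASGD, c={c}")
plt.xscale("log")
plt.yscale("log")
plt.xlabel("n")
plt.ylabel("Mean squared error")
plt.legend()
plt.show()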