library(ggplot2)
library(tidyr)
library(dplyr)
set.seed(123)
n <- 10000                         # sample size
p <- 5                             # number of covariates
theta_true <- c(-2, -1, 0, 1, 2)   # true regression parameter
The goal of this lab is to study the behavior of the averaged and stochastic Newton estimators in the linear regression setting.
We consider the model
\[ Y_i = X_i^T \theta + \varepsilon_i, \]
where the covariates \(X_i \in \mathbb{R}^p\) are i.i.d. standard Gaussian vectors, \(\theta \in \mathbb{R}^p\) is the unknown parameter, and the noise terms \(\varepsilon_i\) are i.i.d. standard Gaussian, independent of the \(X_i\). We take \(n = 10000\), \(p = 5\) and \(\theta = (-2, -1, 0, 1, 2)^T\).
X <- matrix(rnorm(n*p), n, p)      # design matrix with i.i.d. N(0,1) entries
Y <- X %*% theta_true + rnorm(n)   # responses under the linear model with N(0,1) noise
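The asgd_linear function below implements the stochastic gradient recursion with step sequence \(\gamma_i = c\, i^{-\alpha}\) (defaults \(c = 1\), \(\alpha = 0.75\)), together with the running average of the iterates, starting from \(\theta_0 = \bar{\theta}_0 = 0\). Written out, the two recursions it codes are
\[
\theta_{i+1} = \theta_{i} + c\,(i+1)^{-\alpha}\left( Y_{i+1} - X_{i+1}^{T} \theta_{i} \right) X_{i+1},
\qquad
\bar{\theta}_{i+1} = \bar{\theta}_{i} + \frac{1}{i+2}\left( \theta_{i+1} - \bar{\theta}_{i} \right),
\]
so that \(\bar{\theta}_{i}\) is the mean of \(\theta_{0}, \dots, \theta_{i}\).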
asgd_linear <- function(X, Y, c = 1, alpha = 0.75) {
  n <- nrow(X); p <- ncol(X)
  theta    <- matrix(0, n+1, p)   # SGD iterates: row i+1 stores theta_i
  thetabar <- matrix(0, n+1, p)   # running averages of the iterates
  for (i in 1:n) {
    # gradient of the quadratic loss at the current iterate
    grad <- -(Y[i] - sum(X[i,] * theta[i,])) * X[i,]
    # SGD step with learning rate c * i^(-alpha)
    theta[i+1,] <- theta[i,] - c * i^(-alpha) * grad
    # recursive update of the mean of theta_0, ..., theta_i
    thetabar[i+1,] <- thetabar[i,] + (theta[i+1,] - thetabar[i,]) / (i+1)
  }
  list(theta = theta, thetabar = thetabar)
}
theta_path <- asgd_linear(X,Y)
err <- rowSums((theta_path$theta - matrix(theta_true,n+1,p,byrow=TRUE))^2)
errbar <- rowSums((theta_path$thetabar - matrix(theta_true,n+1,p,byrow=TRUE))^2)
df <- data.frame(
i = seq_along(err),
err = err,
errbar=errbar
)
df_long <- df %>%
pivot_longer(
cols = c(err, errbar),
names_to = "method",
values_to = "error"
)
ggplot(df_long, aes(x = i, y = error, color = method)) +
geom_line(linewidth = 1) +
scale_y_log10() +
scale_x_log10() +
scale_color_manual(
values = c(
err = "blue",
errbar = "red"
),
labels = c(
err = "SGD",
errbar = "ASGD"
)
) +
labs(
x = "n",
y = "Erreur quadratique",
color = ""
) +
theme_minimal(base_size = 14)
# Monte Carlo study: average the errors over B independent datasets
B <- 50
err_mat <- matrix(0,n+1,B)
errbar_mat <- matrix(0,n+1,B)
for(b in 1:B){
X <- matrix(rnorm(n*p), n, p)
Y <- X %*% theta_true + rnorm(n)
th <- asgd_linear(X,Y)
err_mat[,b] <- rowSums((th$theta - matrix(theta_true,n+1,p,byrow=TRUE))^2)
errbar_mat[,b] <- rowSums((th$thetabar - matrix(theta_true,n+1,p,byrow=TRUE))^2)
}
df <- data.frame(
i = seq_along(rowMeans(err_mat)),
err = rowMeans(err_mat),
errbar= rowMeans(errbar_mat)
)
df_long <- df %>%
pivot_longer(
cols = c(err, errbar),
names_to = "method",
values_to = "error"
)
ggplot(df_long, aes(x = i, y = error, color = method)) +
geom_line(linewidth = 1) +
scale_y_log10() +
scale_x_log10() +
scale_color_manual(
values = c(
err = "blue",
errbar = "red"
),
labels = c(
err = "SGD",
errbar = "ASGD"
)
) +
labs(
x = "n",
y = "Erreur quadratique moyenne",
color = ""
) +
theme_minimal(base_size = 14)
# Step-size sensitivity: repeat the Monte Carlo study for several constants c on a rescaled design
B <- 50
err_mat_c1 <- matrix(0,n+1,B)
err_mat_c01 <- matrix(0,n+1,B)
err_mat_c001 <- matrix(0,n+1,B)
errbar_mat_c1 <- matrix(0,n+1,B)
errbar_mat_c01 <- matrix(0,n+1,B)
errbar_mat_c001 <- matrix(0,n+1,B)
for(b in 1:B){
X <- matrix(rnorm(n*p), n, p) %*% diag(1:5)   # rescaled design: column j has standard deviation j
Y <- X %*% theta_true + rnorm(n)
th    <- asgd_linear(X, Y, c = 0.3)    # largest constant considered (labelled "c = 0.3" below)
th001 <- asgd_linear(X, Y, c = 0.01)
th01  <- asgd_linear(X, Y, c = 0.1)
err_mat_c1[,b] <- rowSums((th$theta - matrix(theta_true,n+1,p,byrow=TRUE))^2)
err_mat_c01[,b] <- rowSums((th01$theta - matrix(theta_true,n+1,p,byrow=TRUE))^2)
err_mat_c001[,b] <- rowSums((th001$theta - matrix(theta_true,n+1,p,byrow=TRUE))^2)
errbar_mat_c1[,b] <- rowSums((th$thetabar - matrix(theta_true,n+1,p,byrow=TRUE))^2)
errbar_mat_c01[,b] <- rowSums((th01$thetabar - matrix(theta_true,n+1,p,byrow=TRUE))^2)
errbar_mat_c001[,b] <- rowSums((th001$thetabar - matrix(theta_true,n+1,p,byrow=TRUE))^2)
}
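With this rescaled design the Hessian of the quadratic risk is \(\mathbb{E}[X X^{T}] = \mathrm{diag}(1, 4, 9, 16, 25)\), so its eigenvalues spread over more than an order of magnitude; this is presumably why the constants compared here stop at \(c = 0.3\) rather than \(c = 1\). A small illustrative check (not part of the original script, it simply inspects the design of the last replication):
# Theoretical Hessian of the quadratic risk for the rescaled design
H_theo <- diag((1:5)^2)
diag(H_theo)                 # eigenvalues 1, 4, 9, 16, 25
round(crossprod(X) / n, 1)   # empirical counterpart on the last simulated design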
df <- data.frame(
i = seq_len(nrow(err_mat_c1)),
SGD_c1 = rowMeans(err_mat_c1),
SGD_c01 = rowMeans(err_mat_c01),
SGD_c001 = rowMeans(err_mat_c001),
ASGD_c1 = rowMeans(errbar_mat_c1),
ASGD_c01 = rowMeans(errbar_mat_c01),
ASGD_c001 = rowMeans(errbar_mat_c001)
)
# Pivot to long format
df_long <- df %>%
pivot_longer(
cols = -i,
names_to = "method_c",
values_to = "error"
) %>%
separate(method_c, into = c("method", "c"), sep = "_c") %>%
mutate(
# map the name suffixes to the step constants actually used above
c = recode(c, "1" = "c = 0.3", "01" = "c = 0.1", "001" = "c = 0.01"),
method = factor(method, levels = c("SGD", "ASGD"))
)
ggplot(df_long, aes(x = i, y = error,
color = method, linetype = c)) +
geom_line(linewidth = 1) +
scale_y_log10() +
labs(
x = "n",
y = "Erreur quadratique moyenne",
color = "",
linetype = ""
) +
theme_minimal(base_size = 14)
1. Write an R function that returns the whole sequence of estimators \({\theta}_i\) obtained with the stochastic Newton algorithm: \[ {\theta}_{i+1} = {\theta}_{i} + \frac{i+1}{i+100}H_{i}^{-1}\left( Y_{i+1} - X_{i+1}^{T} {\theta}_{i} \right) X_{i+1} \] with \[ H_{i+1}^{-1} = H_{i}^{-1} - \left( 1+ X_{i+1}^{T}H_{i}^{-1}X_{i+1} \right)^{-1}H_{i}^{-1}X_{i+1}X_{i+1}^{T}H_{i}^{-1} . \] Note that \(i+100\) is used here instead of \(i+1\) for stability reasons at the start of the algorithm.
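The inverse recursion above is simply the Sherman–Morrison formula applied to the rank-one update \(H_{i+1} = H_{i} + X_{i+1} X_{i+1}^{T}\) (with \(H_{0} = I_{p}\) in the implementation below): for an invertible matrix \(A\) and a vector \(u\),
\[ \left( A + u u^{T} \right)^{-1} = A^{-1} - \frac{A^{-1} u u^{T} A^{-1}}{1 + u^{T} A^{-1} u}, \]
which is what allows \(H_{i}^{-1}\) to be maintained at cost \(O(p^{2})\) per observation without ever inverting a matrix.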
Newton_linear <- function(X, Y) {
  n <- nrow(X); p <- ncol(X)
  theta <- matrix(0, n+1, p)   # stochastic Newton iterates: row i+1 stores theta_i
  Hinv <- diag(p)              # running inverse Hessian estimate, H_0 = I_p
  for (i in 1:n) {
    # gradient of the quadratic loss at the current iterate
    grad <- -(Y[i] - sum(X[i,] * theta[i,])) * X[i,]
    # Newton-type step; the factor (i+1)/(i+100) damps the first iterations
    theta[i+1,] <- theta[i,] - (i+1)/(i+100) * Hinv %*% grad
    # Sherman-Morrison update of the inverse Hessian with the new observation
    u <- Hinv %*% X[i,]
    Hinv <- Hinv - (1 + sum(X[i,] * u))^(-1) * u %*% t(u)
  }
  theta
}
theta_newton <- Newton_linear(X,Y)
err <- rowSums((theta_newton - matrix(theta_true,n+1,p,byrow=TRUE))^2)
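As a quick sanity check (a sketch, not part of the original script), the last stochastic Newton iterate can be compared with the ordinary least squares estimator on the same data; for \(n = 10000\) both should essentially recover the true parameter.
# Compare the final stochastic Newton iterate, the OLS estimator and the truth
theta_ols <- solve(crossprod(X), crossprod(X, Y))
round(cbind(Newton = theta_newton[n+1, ], OLS = as.vector(theta_ols), truth = theta_true), 3)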
df <- data.frame(
i = seq_along(err),
err = err
)
ggplot(df, aes(x = i, y = err)) +
geom_line(color = "blue", linewidth = 1) +
scale_x_log10() +
scale_y_log10() +
labs(
x = "n",
y = "Erreur quadratique"
) +
theme_minimal()
# Monte Carlo comparison of SGD, ASGD and stochastic Newton on the rescaled design
B <- 50
err_newton <- matrix(0,n+1,B)
err_mat_c1 <- matrix(0,n+1,B)
err_mat_c01 <- matrix(0,n+1,B)
err_mat_c001 <- matrix(0,n+1,B)
errbar_mat_c1 <- matrix(0,n+1,B)
errbar_mat_c01 <- matrix(0,n+1,B)
errbar_mat_c001 <- matrix(0,n+1,B)
for(b in 1:B){
X <- matrix(rnorm(n*p), n, p) %*% diag(1:5)   # same rescaled design as above
Y <- X %*% theta_true + rnorm(n)
th    <- asgd_linear(X, Y, c = 0.3)
th001 <- asgd_linear(X, Y, c = 0.01)
th01  <- asgd_linear(X, Y, c = 0.1)
newt  <- Newton_linear(X, Y)
err_newton[,b] <- rowSums((newt - matrix(theta_true,n+1,p,byrow=TRUE))^2)
err_mat_c1[,b] <- rowSums((th$theta - matrix(theta_true,n+1,p,byrow=TRUE))^2)
err_mat_c01[,b] <- rowSums((th01$theta - matrix(theta_true,n+1,p,byrow=TRUE))^2)
err_mat_c001[,b] <- rowSums((th001$theta - matrix(theta_true,n+1,p,byrow=TRUE))^2)
errbar_mat_c1[,b] <- rowSums((th$thetabar - matrix(theta_true,n+1,p,byrow=TRUE))^2)
errbar_mat_c01[,b] <- rowSums((th01$thetabar - matrix(theta_true,n+1,p,byrow=TRUE))^2)
errbar_mat_c001[,b] <- rowSums((th001$thetabar - matrix(theta_true,n+1,p,byrow=TRUE))^2)
}
df <- data.frame(
i = seq_len(nrow(err_mat_c1)),
Newton = rowMeans(err_newton),
SGD_c1 = rowMeans(err_mat_c1),
SGD_c01 = rowMeans(err_mat_c01),
SGD_c001 = rowMeans(err_mat_c001),
ASGD_c1 = rowMeans(errbar_mat_c1),
ASGD_c01 = rowMeans(errbar_mat_c01),
ASGD_c001 = rowMeans(errbar_mat_c001)
)
# Pivot to long format
df_long <- df %>%
pivot_longer(
cols = -i,
names_to = "method_c",
values_to = "error"
) %>%
mutate(
method = case_when(
method_c == "Newton" ~ "Newton",
grepl("^SGD", method_c) ~ "SGD",
grepl("^ASGD", method_c) ~ "ASGD"
),
c = case_when(
method_c == "Newton" ~ "Newton",
grepl("c1$", method_c) ~ "c = 1",
grepl("c01$", method_c) ~ "c = 0.1",
grepl("c001$", method_c) ~ "c = 0.01"
)
) %>%
mutate(
method = factor(method, levels = c("Newton", "SGD", "ASGD")),
c = factor(c, levels = c("Newton", "c = 0.3", "c = 0.1", "c = 0.01"))
)
ggplot(df_long, aes(x = i, y = error,
color = method, linetype = c)) +
geom_line(linewidth = 1) +
scale_y_log10() +
scale_x_log10() +
scale_linetype_manual(
values = c(
"Newton" = "solid",
"c = 1" = "dashed",
"c = 0.1" = "dotdash",
"c = 0.01" = "dotted"
)
) +
labs(
x = "n",
y = "Erreur quadratique moyenne",
color = "",
linetype = ""
) +
theme_minimal(base_size = 14)