islr_ch6.html

<!DOCTYPE html>
<html lang="en">
<head>

    <!-- Document Settings -->
    <meta charset="utf-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
	
	<!-- On Post front-matter YAML, set "use_math: true" to use LaTex -->
	
	  
<script type="text/x-mathjax-config">
// MathJax (v2 API) setup: TeX numbering, math delimiters, and error reporting.
MathJax.Hub.Config({
    // Automatic equation numbering using the "AMS" scheme.
    TeX: {
        equationNumbers: { autoNumber: "AMS" }
    },
    // Delimiter pairs recognised as inline and display math in the page text.
    tex2jax: {
        inlineMath: [ ['$', '$'], ["\\(", "\\)"] ],
        displayMath: [ ['$$', '$$'], ["\\[", "\\]"] ],
        processEscapes: true
    }
});

// Both failure signals are reported to the reader with the same alert text;
// message[1] is appended to the fixed prefix.
["Math Processing Error", "TeX Jax - parse error"].forEach(function (signalName) {
    MathJax.Hub.Register.MessageHook(signalName, function (message) {
        alert("Math Processing Error: " + message[1]);
    });
});
</script>

<script type="text/javascript" async
    src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
	

    <!-- Base Meta -->
    <!-- dynamically fixing the title for tag/author pages -->


    <title>ISLR - Chapter 6. Linear Model Selection and Regularization</title>
    <meta name="HandheldFriendly" content="True" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <!-- Styles'n'Scripts -->
    <link rel="stylesheet" type="text/css" href="/assets/built/screen.css" />
    <link rel="stylesheet" type="text/css" href="/assets/built/screen.edited.css" />
    <link rel="stylesheet" type="text/css" href="/assets/built/syntax.css" />

    <!-- syntax.css -->
    <link rel="stylesheet" type="text/css" href="/assets/built/syntax.css" />
	
    <!-- highlight.js -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/default.min.css">
    <style>.hljs { background: none; }</style>

    <!--[if IE]>
        <style>
            p, ol, ul{
                width: 100%;
            }
            blockquote{
                width: 100%;
            }
        </style>
    <![endif]-->
    
    <!-- This tag outputs SEO meta+structured data and other important settings -->
    <meta name="description" content="" />
    <link rel="shortcut icon" href="http://0.0.0.0:4000/assets/built/images/favicon.jpg" type="image/jpeg" />
    <link rel="canonical" href="http://0.0.0.0:4000/islr_ch6" />
    <meta name="referrer" content="no-referrer-when-downgrade" />

     <!--title below is coming from _includes/dynamic_title-->
    <meta property="og:site_name" content="Darron's Devlog" />
    <meta property="og:type" content="website" />
    <meta property="og:title" content="ISLR - Chapter 6. Linear Model Selection and Regularization" />
    <meta property="og:description" content="Chapter 6. Linear Model Selection and Regularization 6.1. Subset Selection 6.1.1. Best Subset Selection 6.1.2. Stepwise Selection Forward Stepwise Selection Backward Stepwise Selection Hybrid Approaches 6.1.3. Choosing the Optimal Model Validation and Cross-Validation 6.2. Shrinkage Methods 6.2.1. Ridge Regression in Singular Value Decomposition 6.2.2. The Lasso Another Formulation for Ridge" />
    <meta property="og:url" content="http://0.0.0.0:4000/islr_ch6" />
    <meta property="og:image" content="http://0.0.0.0:4000/assets/built/images/blog-cover1.png" />
    <meta property="article:publisher" content="https://www.facebook.com/" />
    <meta property="article:author" content="https://www.facebook.com/" />
    <meta property="article:published_time" content="2020-04-29T15:00:00+00:00" />
    <meta property="article:modified_time" content="2020-04-29T15:00:00+00:00" />
    <meta property="article:tag" content="ISLR" />
    <meta name="twitter:card" content="summary_large_image" />
    <meta name="twitter:title" content="ISLR - Chapter 6. Linear Model Selection and Regularization" />
    <meta name="twitter:description" content="Chapter 6. Linear Model Selection and Regularization 6.1. Subset Selection 6.1.1. Best Subset Selection 6.1.2. Stepwise Selection Forward Stepwise Selection Backward Stepwise Selection Hybrid Approaches 6.1.3. Choosing the Optimal Model Validation and Cross-Validation 6.2. Shrinkage Methods 6.2.1. Ridge Regression in Singular Value Decomposition 6.2.2. The Lasso Another Formulation for Ridge" />
    <meta name="twitter:url" content="http://0.0.0.0:4000/" />
    <meta name="twitter:image" content="http://0.0.0.0:4000/assets/built/images/blog-cover1.png" />
    <meta name="twitter:label1" content="Written by" />
    <meta name="twitter:data1" content="Darron's Devlog" />
    <meta name="twitter:label2" content="Filed under" />
    <meta name="twitter:data2" content="ISLR" />
    <meta name="twitter:site" content="@" />
    <meta name="twitter:creator" content="@" />
    <meta property="og:image:width" content="1400" />
    <meta property="og:image:height" content="933" />

    <script type="application/ld+json">
{
    "@context": "https://schema.org",
    "@type": "Website",
    "publisher": {
        "@type": "Organization",
        "name": "Darron's Devlog",
        "logo": "http://0.0.0.0:4000/"
    },
    "url": "http://0.0.0.0:4000/islr_ch6",
    "image": {
        "@type": "ImageObject",
        "url": "http://0.0.0.0:4000/assets/built/images/blog-cover1.png",
        "width": 2000,
        "height": 666
    },
    "mainEntityOfPage": {
        "@type": "WebPage",
        "@id": "http://0.0.0.0:4000/islr_ch6"
    },
    "description": "Chapter 6. Linear Model Selection and Regularization 6.1. Subset Selection 6.1.1. Best Subset Selection 6.1.2. Stepwise Selection Forward Stepwise Selection Backward Stepwise Selection Hybrid Approaches 6.1.3. Choosing the Optimal Model Validation and Cross-Validation 6.2. Shrinkage Methods 6.2.1. Ridge Regression in Singular Value Decomposition 6.2.2. The Lasso Another Formulation for Ridge"
}
    </script>

    <!-- <script type="text/javascript" src="https://demo.ghost.io/public/ghost-sdk.min.js?v=724281a32e"></script>
    <script type="text/javascript">
    ghost.init({
    	clientId: "ghost-frontend",
    	clientSecret: "f84a07a72b17"
    });
    </script> -->

    <meta name="generator" content="Jekyll 3.6.2" />
    <link rel="alternate" type="application/rss+xml" title="ISLR - Chapter 6. Linear Model Selection and Regularization" href="/feed.xml" />


</head>
<body class="post-template">

    <div class="site-wrapper">
        <!-- All the main content gets inserted here, index.hbs, post.hbs, etc -->
        <!-- default -->

<!-- The tag above means: insert everything in this file
into the {body} of the default.hbs template -->

<header class="site-header outer">
    <div class="inner">
        <nav class="site-nav">
    <div class="site-nav-left">
        
            
                <a class="site-nav-logo" href="/">Darron's Devlog</a>
            
        
            <ul class="nav" role="menu">
    <li class="nav-home" role="menuitem"><a href="/">Home</a></li>
    <li class="nav-about" role="menuitem"><a href="/about/">About</a></li>
    <li class="nav-projects" role="menuitem"><a href="/tag/projects/">Projects</a></li>
    <li class="nav-studies" role="menuitem"><a href="/tag/studies/">Studies</a></li>
	<li class="nav-blog" role="menuitem"><a href="/tag/blog/">Blog</a></li>
    <li class="nav-archive" role="menuitem">
        <a href="/archive.html">All Posts</a>
    </li>
</ul>
        
    </div>
    <div class="site-nav-right">
        <div class="social-links">
            
            
        </div>
        
            <a class="subscribe-button" href="#subscribe">Search</a>
        
    </div>
</nav>

    </div>
</header>

<!-- Everything inside the #post tags pulls data from the post -->
<!-- #post -->

<main id="site-main" class="site-main outer" role="main">
    <div class="inner">

        <article class="post-full  tag-islr  no-image">

            <header class="post-full-header">
                <section class="post-full-meta">
                    <time class="post-full-meta-date" datetime="2020-04-29">29 April 2020</time>
                    
                        <span class="date-divider">/</span>
                        
							
                               <a href='/tag/islr/'>ISLR</a>
                            
                        
                </section>
                <h1 class="post-full-title">ISLR - Chapter 6. Linear Model Selection and Regularization</h1>
            </header>
	<!--
            
	-->
            <section class="post-full-content">
                <div class="kg-card-markdown">
                    <ul id="markdown-toc">
  <li><a href="#chapter-6-linear-model-selection-and-regularization" id="markdown-toc-chapter-6-linear-model-selection-and-regularization">Chapter 6. Linear Model Selection and Regularization</a></li>
  <li><a href="#61-subset-selection" id="markdown-toc-61-subset-selection">6.1. Subset Selection</a>    <ul>
      <li><a href="#611-best-subset-selection" id="markdown-toc-611-best-subset-selection">6.1.1. Best Subset Selection</a></li>
      <li><a href="#612-stepwise-selection" id="markdown-toc-612-stepwise-selection">6.1.2. Stepwise Selection</a>        <ul>
          <li><a href="#forward-stepwise-selection" id="markdown-toc-forward-stepwise-selection">Forward Stepwise Selection</a></li>
          <li><a href="#backward-stepwise-selection" id="markdown-toc-backward-stepwise-selection">Backward Stepwise Selection</a></li>
          <li><a href="#hybrid-approaches" id="markdown-toc-hybrid-approaches">Hybrid Approaches</a></li>
        </ul>
      </li>
      <li><a href="#613-choosing-the-optimal-model" id="markdown-toc-613-choosing-the-optimal-model">6.1.3. Choosing the Optimal Model</a>        <ul>
          <li><a href="#validation-and-cross-validation" id="markdown-toc-validation-and-cross-validation">Validation and Cross-Validation</a></li>
        </ul>
      </li>
    </ul>
  </li>
  <li><a href="#62-shrinkage-methods" id="markdown-toc-62-shrinkage-methods">6.2. Shrinkage Methods</a>    <ul>
      <li><a href="#621-ridge-regression" id="markdown-toc-621-ridge-regression">6.2.1. Ridge Regression</a>        <ul>
          <li><a href="#in-singular-value-decomposition" id="markdown-toc-in-singular-value-decomposition">in Singular Value Decomposition</a></li>
        </ul>
      </li>
      <li><a href="#622-the-lasso" id="markdown-toc-622-the-lasso">6.2.2. The Lasso</a>        <ul>
          <li><a href="#another-formulation-for-ridge-regression-and-the-lasso" id="markdown-toc-another-formulation-for-ridge-regression-and-the-lasso">Another Formulation for Ridge Regression and the Lasso</a></li>
          <li><a href="#a-simple-special-case" id="markdown-toc-a-simple-special-case">A Simple Special Case</a></li>
          <li><a href="#bayesian-interpretation" id="markdown-toc-bayesian-interpretation">Bayesian Interpretation</a></li>
        </ul>
      </li>
    </ul>
  </li>
  <li><a href="#63-dimension-reduction-methods" id="markdown-toc-63-dimension-reduction-methods">6.3. Dimension Reduction Methods</a>    <ul>
      <li><a href="#631--principal-components-regression" id="markdown-toc-631--principal-components-regression">6.3.1.  Principal Components Regression</a>        <ul>
          <li><a href="#principal-components-analysis" id="markdown-toc-principal-components-analysis">Principal Components Analysis</a></li>
          <li><a href="#the-principal-components-regression-approach" id="markdown-toc-the-principal-components-regression-approach">The Principal Components Regression Approach</a></li>
        </ul>
      </li>
      <li><a href="#632-partial-least-squares" id="markdown-toc-632-partial-least-squares">6.3.2. Partial Least Squares</a></li>
    </ul>
  </li>
  <li><a href="#64-considerations-in-high-dimensions" id="markdown-toc-64-considerations-in-high-dimensions">6.4. Considerations in High Dimensions</a>    <ul>
      <li><a href="#641-high-dimensional-data" id="markdown-toc-641-high-dimensional-data">6.4.1. High-Dimensional Data</a></li>
      <li><a href="#642-what-goes-wrong-in-high-dimensions" id="markdown-toc-642-what-goes-wrong-in-high-dimensions">6.4.2. What Goes Wrong in High Dimensions?</a></li>
      <li><a href="#643-regression-in-high-dimensions" id="markdown-toc-643-regression-in-high-dimensions">6.4.3. Regression in High Dimensions</a></li>
      <li><a href="#644-interpreting-results-in-high-dimensions" id="markdown-toc-644-interpreting-results-in-high-dimensions">6.4.4. Interpreting Results in High Dimensions</a></li>
    </ul>
  </li>
</ul>

<h2 id="chapter-6-linear-model-selection-and-regularization">Chapter 6. Linear Model Selection and Regularization</h2>
<ul>
  <li>Limitations of LSE
    <ol>
      <li>Prediction Accuracy:
        <ul>
          <li>if <em>n</em> is not much larger than <em>p</em>, the least squares fit can have a lot 
 of variability, resulting in overfitting and poor predictions on test data.</li>
          <li>if <em>p</em> &gt; <em>n</em>, there is no unique solution for the least squares coefficient 
 estimate; as $ Var(\hat\beta)=\infty$.</li>
          <li>if <em>p</em> is large, there can be correlations between <em>X</em> variables. A model 
 having multicollinearity can have high variance.<br />
<em>Constraining</em> or <em>Shrinking</em> the estimated coefficients can reduce the variance 
with a negligible increase in bias, and improve the accuracy on test data.</li>
        </ul>
      </li>
      <li>Model Interpretability:
        <ul>
          <li>There are irrelevant variables $X_j$. Removing them, by setting their coefficient estimates 
 to $\beta_j = 0$, gives a more interpretable model.<br />
<em>Feature selection</em> or <em>Variable selection</em> can exclude irrelevant variables from a 
multiple regression model.</li>
        </ul>
      </li>
    </ol>
  </li>
</ul>

<h2 id="61-subset-selection">6.1. Subset Selection</h2>

<h3 id="611-best-subset-selection">6.1.1. Best Subset Selection</h3>
<ul>
  <li>
    <p>fit a separate least squares regression for all $2^p$ possible models with combinations 
of the <em>p</em> predictors.</p>
  </li>
  <li>Algorithm
    <ol>
      <li>$\mathcal{M}_0$ as <em>null model</em> (i.e., $ Y = \beta_0 + \epsilon $)</li>
      <li>For $ k = 1, 2, \ldots, p $:<br />
  (a) Fit all \({p \choose k}\) models with <em>k</em> predictors<br />
  (b) Pick the smallest RSS, (or largest $R^2$) = $ \mathcal{M}_k $</li>
      <li>Select best model among $\mathcal{M}_0, \ldots,\mathcal{M}_p$ using cross-validated 
  prediction error, $C_p$ (AIC), BIC, or adjusted $R^2$</li>
    </ol>
  </li>
  <li>
    <p>Guarantees the best selection, while it suffers from computational limitations. Also, it 
only works for least squares linear regression.</p>
  </li>
  <li>in the case of logistic regression, we use the <em>deviance</em>, $-2$ times the maximized log-likelihood, instead of RSS in 
the 2nd step of the algorithm above.</li>
</ul>

<h3 id="612-stepwise-selection">6.1.2. Stepwise Selection</h3>

<h4 id="forward-stepwise-selection">Forward Stepwise Selection</h4>
<ul>
  <li>Algorithm
    <ol>
      <li>$\mathcal{M}_0$ as <em>null model</em></li>
      <li>For $ k = 0, 1, \ldots, p-1 $:<br />
  (a) Fit all <em>p - k</em> models that augment the predictors in \(\mathcal{M}_k\) with one additional predictor<br />
  (b) Pick the one with the smallest RSS among these <em>p - k</em> models, and call it $\mathcal{M}_{k+1}$</li>
      <li>Select best model among $\mathcal{M}_0, \ldots,\mathcal{M}_p$ with CV scores</li>
    </ol>
  </li>
  <li>Total $\frac{p(p+1)}{2}+1$ possible models. No guarantee but available for the case of 
high dimensional data($n&lt;p$).</li>
</ul>

<h4 id="backward-stepwise-selection">Backward Stepwise Selection</h4>
<ul>
  <li>Algorithm
    <ol>
      <li>$\mathcal{M}_p$ as <em>full model</em>, contains all <em>p</em> predictors</li>
      <li>For $ k = p, p-1, \ldots, 1 $:<br />
  (a) Fit all <em>k</em> models that contain all but one of the predictors in \(\mathcal{M}_k\), i.e., <em>k - 1</em> predictors each<br />
  (b) Pick the one with the smallest RSS among these <em>k</em> models, and call it $\mathcal{M}_{k-1}$</li>
      <li>Select best model among $\mathcal{M}_0, \ldots,\mathcal{M}_p$ with CV scores</li>
    </ol>
  </li>
  <li>Total $\frac{p(p+1)}{2}+1$ possible models. No guarantee and not for <em>n &lt; p</em> case.</li>
</ul>

<h4 id="hybrid-approaches">Hybrid Approaches</h4>
<ul>
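  <li>predictors are added one at a time as in forward selection, but after each addition any predictor that no longer improves the model fit may be removed.</li>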
</ul>

<h3 id="613-choosing-the-optimal-model">6.1.3. Choosing the Optimal Model</h3>
<ul>
  <li>A model containing all of the predictors will always have the smallest RSS and the largest 
$R^2$, since these quantities are related to the training error. Instead, we need a model with a 
low test error.</li>
</ul>

<ol>
  <li>
    <p>$C_p = \frac{1}{n}(RSS + 2 d \hat\sigma^2)$<br />
 For a fitted least squares model, with <em>d</em> as the number of predictors and $\hat\sigma^2$ as 
 an estimate of the variance of the error. Typically $\hat\sigma^2$ is estimated using the full 
 model containing all predictors. The penalty added to the training RSS adjusts for the fact that 
 the training error underestimates the test error. As the number of predictors increases, the penalty increases. 
 If $\hat\sigma^2$ is an unbiased estimate of $\sigma^2$, then $C_p$ is an unbiased 
 estimate of test MSE. Hence, the model with the lowest $C_p$ is chosen as the best model.</p>
  </li>
  <li>
    <p>AIC $= \frac{1}{n}(RSS + 2 d \hat\sigma^2)$<br />
 Defined for models fit by maximum likelihood (MLE); the form shown here is for least squares, with irrelevant constants omitted. 
 For least squares models, $C_p$ and AIC are proportional to each other.</p>
  </li>
  <li>
    <p>BIC $= \frac{1}{n}(RSS + \log(n)d\hat\sigma^2)$<br />
 From a Bayesian point of view, for a fitted least squares model. Also given with 
 irrelevant constants omitted. BIC has a heavier penalty than $C_p$ or AIC (since $\log n &gt; 2$ for $n &gt; 7$), resulting in the selection of smaller 
 models.</p>
  </li>
  <li>
    <p>Adjusted $R^2 = 1 - \frac{RSS/(n-d-1)}{TSS/(n-1)}$<br />
 Since the usual $R^2$ is defined as $1 - RSS/TSS$, it always increases as more variables added. 
 Adjusted $R^2$ gives penalty of <em>d</em>, the number of predictors in the denominator. Unlike other 
 statistics, a large value of adjusted $R^2$ indicates a small test error.</p>
  </li>
</ol>

<h4 id="validation-and-cross-validation">Validation and Cross-Validation</h4>
<ul>
  <li><em>one-standard-error rule</em><br />
First calculate the standard error of the estimated test MSE for each model size, then select the 
smallest model for which the estimated test error is within one standard error of the lowest point 
on the curve.<br />
If a set of models appear to be more or less equally good, then we might as well choose the simplest 
model; the model with the smallest number of predictors.</li>
</ul>

<h2 id="62-shrinkage-methods">6.2. Shrinkage Methods</h2>

<h3 id="621-ridge-regression">6.2.1. Ridge Regression</h3>
<ul>
  <li>
    <p>Ridge regression coefficient estimates<br />
\(\begin{align*}
\hat\beta^R &amp;= \text{argmin}_{\beta}\left[
                  \underbrace{\sum_{i=1}^n(y_i-\beta_0-\sum_{j=1}^p \beta_j x_{ij})^2}_{RSS}
                  + \lambda\sum_{j=1}^p \beta_j^2 \right] \\
            &amp;= (X^TX + \lambda I)^{-1} X^T\underline{y}
\end{align*}\)</p>
  </li>
  <li>
    <p>$\lambda \ge 0 $ is a <em>tuning parameter</em>, $\lambda\sum_{j=1}^p \beta_j^2$ is a <em>shrinkage penalty</em>. 
The penalty is small when the coefficients are close to zero, and so it has the effect of <em>shrinking</em> 
the estimates of $\beta_j$ towards zero. Ridge regression will produce a different set of coefficient 
estimates $\hat\beta_{\lambda}^R$, for each value of $\lambda$.</p>
  </li>
  <li>
    <p>We do not want to shrink the intercept $\beta_0$, which is simply a measure of the mean value of 
the response when $x_{i1}=x_{i2}=\ldots=x_{ip}=0$. If the variables, the columns of the data matrix
<strong>$X$</strong>, have been centered to have mean zero before ridge regression is performed, then the estimated 
intercept will take the form $\hat\beta_0 = \bar{y} = \sum_{i=1}^n y_i/n$.</p>
  </li>
  <li>
    <p>The standard least squares coefficient estimates are <em>scale equivariant</em>; multiplying $X_j$ by a constant 
<em>c</em> leads to a scaling of the least squares coefficient estimates by a factor of 1/<em>c</em>. I.e., regardless 
of how the <em>j</em>th predictor is scaled, $X_j\hat\beta_j$ will remain the same.<br />
In contrast, the ridge regression coefficient estimates can change substantially when multiplying a 
given predictor by a constant. The value of $X_j\hat\beta_{j,\lambda}^R$ may depend on the scaling of 
the other predictors. Thus, before applying ridge regression, the variables need to be standardized to 
have a standard deviation of one.<br />
The formula: \(\tilde{x}_{ij}=\frac{x_{ij}}{\sqrt{\frac{1}{n}\sum_{i=1}^n(x_{ij}-\bar{x}_j)^2}}\)</p>
  </li>
  <li>
    <p>Ridge regression outperforms the standard least squares when the number of variables <em>p</em> is almost 
as large as the number of observations <em>n</em>, or even when $p &gt; n$. Also it has computational advantages 
over best subset selection, which requires searching through $2^p$ models. Ridge regression only fits a 
single model for any fixed value of $\lambda$.</p>
  </li>
</ul>

<h4 id="in-singular-value-decomposition">in Singular Value Decomposition</h4>
<ul>
  <li>where $ X = UDV^T$ is the singular value decomposition of $X$, with $\underline{u}_j$ the columns of $U$ and $d_j$ the diagonal entries (singular values) of $D$,<br />
\(\begin{align*} X\hat\beta^{\text{LSE}} &amp;= X(X^TX)^{-1}X^T\underline{y} \\
                                         &amp;= UU^T\underline{y} \\
                            X\hat\beta^R &amp;= UD(D^2 + \lambda I)^{-1}DU^T\underline{y} \\
                                         &amp;= \sum_{j=1}^p\underline{u}_j\frac{d_j^2}{d_j^2+\lambda}\underline{u}_j^T\underline{y}
\end{align*}\)<br />
The <em>effective degrees of freedom</em> of the ridge fit:<br />
\(\begin{align*}
\rightarrow \text{df}(\lambda) &amp;= \text{tr}[X(X^TX + \lambda I)^{-1} X^T] \\
                               &amp;= \text{tr}(H_{\lambda})  \\
                               &amp;= \sum_{j=1}^p\frac{d_j^2}{d_j^2+\lambda}
\end{align*}\)</li>
</ul>

<h3 id="622-the-lasso">6.2.2. The Lasso</h3>
<ul>
  <li>
    <p>Ridge regression estimates shrink towards zero but will not set any of them exactly to zero (unless 
$\lambda = \infty$). This may not be a problem for prediction accuracy, but it can be a challenge in 
model interpretation when <em>p</em> is quite large.</p>
  </li>
  <li>
    <p>The <em>lasso</em><br />
\(\hat\beta^L_{\lambda} = \text{argmin}_{\beta}\left[RSS+\lambda\sum_{j=1}^p|\beta_j|\right]\)<br />
Instead of the $\ell_2$ penalty in ridge, the lasso uses an $\ell_1$ penalty. 
The $\ell_1$ norm of a coefficient vector $\beta$ is given by $\lVert \beta \rVert_1 = 
\sum |\beta_j|$. This penalty has the effect of forcing some of the coefficient estimates to be 
exactly equal to zero when the tuning parameter is sufficiently large. Hence, the lasso performs 
<em>variable selection</em>, these <em>sparse</em> models with the lasso are much easier to interpret than those 
with ridge.</p>
  </li>
</ul>

<h4 id="another-formulation-for-ridge-regression-and-the-lasso">Another Formulation for Ridge Regression and the Lasso</h4>
<ul>
  <li>
    <p>Ridge:
\(\text{min}_{\beta}\left\{ \sum_{i=1}^n(y_i-\beta_0-\sum_{j=1}^p\beta_j x_{ij})^2 
                      \right\}\) subject to $\sum_{j=1}^p\beta_j^2 \le s $<br />
Lasso:
\(\text{min}_{\beta}\left\{ \sum_{i=1}^n(y_i-\beta_0-\sum_{j=1}^p\beta_j x_{ij})^2 
                      \right\}\) subject to $\sum_{j=1}^p|\beta_j| \le s $<br />
where the <em>budget s</em> plays the role of the regularization parameter: a larger $\lambda$ corresponds to a smaller $s$ ($\lambda\uparrow \equiv s\downarrow$).</p>
  </li>
  <li>
    <p>when $p = 2$, the ridge regression estimates have the smallest RSS out of all points that lie 
within the circle defined by $\beta_1^2 + \beta_2^2 \le s$, while the lasso estimates have the smallest RSS out of all points within 
the diamond defined by $|\beta_1|+|\beta_2| \le s$. When $p = 3$, the constraint region for ridge 
becomes a sphere, and for lasso a polyhedron. For larger <em>p</em>, they become a hypersphere and a 
polytope, respectively. The lasso leads to feature selection due to the sharp corners of its constraint region.</p>
  </li>
  <li>
    <p>the number of predictors that is related to the response is never known a <em>priori</em> for real data sets. 
A technique such as cross-validation can be used in order to determine which approach is better on a 
particular data set.</p>
  </li>
</ul>

<h4 id="a-simple-special-case">A Simple Special Case</h4>
<ul>
  <li>
    <p>An analytical method(solution) for the case when $n = p$, and <strong><em>X</em></strong> a diagonal matrix with 1’s on 
the diagonal and 0’s in all off-diagonal elements. I.e., the columns of <strong><em>X</em></strong> are orthogonal. Also, 
assume that we are performing regression without an intercept(or standardized).<br />
(c.f. in real world cases, we need to use numerical methods.)</p>
  </li>
  <li>
    <p>The usual least squares estimate $\hat\beta$ minimizes $\sum_{j=1}^p(y_j-\beta_j)^2$, so that $\hat\beta_j = y_j$.<br />
The ridge estimate minimizes $\sum_{j=1}^p(y_j-\beta_j)^2+\lambda\sum_{j=1}^p\beta_j^2$.<br />
The lasso estimate minimizes $\sum_{j=1}^p(y_j-\beta_j)^2+\lambda\sum_{j=1}^p|\beta_j|$.</p>
  </li>
  <li>
    <p>The ridge regression estimates $\hat\beta_j^R = y_j/(1+\lambda)$ and<br />
\(\text{the lasso estimates} \begin{align*}
\hat\beta_j^L &amp;= \text{sign}(\hat\beta_j)(|\hat\beta_j|-\lambda/2)_{+}, \\
    \text{or} &amp;= \begin{cases}
                  y_j - \lambda/2, &amp; \mbox{if }y_j &gt; \lambda/2; \\
                  y_j + \lambda/2, &amp; \mbox{if }y_j &lt; -\lambda/2; \\
                  0				 &amp; \mbox{if }|y_j| \le \lambda/2.
                  \end{cases}
\end{align*}\)<br />
<img src="/assets/images/ch6_ridge_lasso_effect_0.png" alt="png" width="70%" height="70%" /><br />
Ridge shrinks all coefficients towards zero by the same <em>“proportion”</em>,<br />
Lasso shrinks all coefficients towards zero by the same <em>“amount”</em>.</p>
  </li>
</ul>

<h4 id="bayesian-interpretation">Bayesian Interpretation</h4>
<ul>
  <li>$p(\beta|X,Y)\propto f(Y|X,\beta)p(\beta|X) = f(Y|X,\beta)p(\beta)$<br />
with assumption of $p(\beta)=\prod_{j=1}^p g(\beta_j)$ for some density function <em>g</em>.<br />
Two special cases of <em>g</em>:
    <ul>
      <li>If <em>g</em> is a Gaussian distribution with mean zero and standard deviation a function of $\lambda$, 
it follows that the <em>posterior mode</em> for $\beta$, is given by the ridge regression solution. Also, 
the solution is equal to posterior mean.</li>
      <li>If <em>g</em> is a double-exponential(Laplace) distribution with mean zero and scale parameter a function 
of $\lambda$, it follows that the posterior mode for $\beta$ is the lasso solution (which is not the 
posterior mean in this case).</li>
    </ul>
  </li>
  <li>Hence, the lasso expects a priori that many of the coefficients are (exactly) zero, while ridge 
assumes the coefficients are randomly distributed about zero.</li>
</ul>

<h2 id="63-dimension-reduction-methods">6.3. Dimension Reduction Methods</h2>
<ul>
  <li>
    <p><em>p</em> predictors to <em>M</em> new transformed variables.<br />
Let $Z_m = \sum_{j=1}^p\phi_{jm}X_j$ represent <em>M &lt; p linear combinations</em> of original <em>p</em> predictors. 
Then fit the linear regression model $y_i = \theta_0 + \sum_{m=1}^M\theta_m z_{im} + \epsilon_i, \quad i = 1, \ldots, n$, 
using least squares. If the constants $\phi_{1m}, \ldots, \phi_{pm}$ are chosen wisely, dimension 
reduction approaches can outperform least squares regression. I.e., using least squares, fitting 
reduced model can lead to better results than fitting the standard linear model.</p>
  </li>
  <li>
    <p>\(\sum_{m=1}^M\theta_m z_{im} = \sum_{m=1}^M\theta_m\sum_{j=1}^p\phi_{jm}x_{ij} = 
  \sum_{j=1}^p\sum_{m=1}^M\theta_m\phi_{jm}x_{ij} = \sum_{j=1}^p\beta_j x_{ij},\)<br />
  where \(\beta_j = \sum_{m=1}^M\theta_m\phi_{jm}\).<br />
  Hence, this model can be a special case of the standard linear regression model. In situations where 
  <em>p</em> is large relative to <em>n</em>, dimension reduction methods can significantly reduce the variance of the 
  fitted coefficients. If $M = p$, and all the $Z_m$ are linearly independent, then there are no constraints 
  and the model is equivalent to the standard linear model.</p>
  </li>
  <li>
    <p>All dimension reduction methods work in two steps. First, the transformed predictors $Z_m$ are obtained. 
  Second, the model is fit using these <em>M</em> predictors. The choice of $Z_m$, which is, the selection of the 
  $\phi_{jm}$’s can be achieved in different ways.</p>
  </li>
</ul>

<h3 id="631--principal-components-regression">6.3.1.  Principal Components Regression</h3>

<h4 id="principal-components-analysis">Principal Components Analysis</h4>
<ul>
  <li>
    <p>Goal of PCA:<br />
  PCA is a technique for reducing the dimension of an <em>n by p</em> data matrix <strong><em>X</em></strong>, finding a small number 
  of dimensions <em>M</em> that retain a similar amount of information to the original <em>p</em> predictors.</p>
  </li>
  <li>
    <p>The <em>principal component</em> direction of the data is that along which the observations <em>vary the most</em>; 
  with the largest variance of the observations projected onto. The principal component vector $Z_m$ 
  defines the line that is <em>as close as possible</em> to the data, minimizing the sum of the squared 
  perpendicular distances between each point and the line. In other words, the principal component appears 
  to capture most of the information contained in two variables.</p>
  </li>
  <li>
    <p>e.g. in the first principal component,<br />
  <img src="/assets/images/ch6_pca_0.png" alt="png" width="70%" height="70%" /><br />
  total variance is preserved: $Var(X_1)+Var(X_2) = Var(PC_1)+Var(PC_2)$</p>
  </li>
  <li>
    <p>where $X_s$ is $n \times p$ standardized matrix,<em>j</em>th Principal Component Vector of $X_s$: $z_j = X_s v_j$, 
  $\quad j=1,\ldots,p$ is that satisfying \(\text{max}_{\alpha}Var(X_s\alpha)\) subject to \(\lVert\alpha\rVert=1\). 
  Here, the values of $z_{1j}, \ldots, z_{nj}$ are known as the <em>principal component scores</em>.<br />
  $v_j$ is $p \times 1$ size eigenvector of $X_s^T X_s$ corresponding to the <em>j</em>th largest eigenvalue, 
  and $\alpha$ is $v_j$’s orthogonality to $v_1,\ldots,v_{j-1}$ ($\alpha^T S v_k = 0$, where S is the 
  sample covariance matrix of $X_s$, or $X_s^T X_s$, and $k = 1, \cdots, j-1$).<br />
  Then $z_1 = X_s v_1$, $z_2\bot z_1$, $z_3\bot z_1,z_2$, $\cdots$, $z_p\bot z_1,\ldots,z_{p-1}$.</p>
  </li>
  <li>
    <p><em>derivation</em><br />
  Since $X_s$ is standardized matrix,<br />
  \(Var(X_s\alpha) = \alpha^T X_s^T X_s\alpha\)<br />
  <em>by Lagrangian form</em>,<br />
  \(\begin{align*}
  \text{max}_{\alpha}Q(X_s,\lambda) &amp;= \text{max}_{\alpha}\left[\alpha^T X_s^T X_s\alpha
                                                              -\lambda\alpha^T\alpha \right] \\
  \rightarrow \frac{\partial Q}{\partial\alpha} &amp;= 2X_s^T X_s\alpha - 2\lambda\alpha \\
  \text{for } \hat\alpha, X_s^T X_s\alpha &amp;= \lambda\alpha
  \end{align*}\)<br />
  <em>note that</em> $\mathbb{A}v = ev$ defines an eigenvalue $e$ and its eigenvector $v$ of $\mathbb{A}$.<br />
  Thus, $\alpha = v_j$, the <em>j</em>th eigenvector of $X_s^T X_s$, that is, the constraint of orthogonality 
  is satisfied.</p>
  </li>
  <li>
    <p>Since PCA has no single solution <em>M</em>;<br />
  the proportion of variance explained by <em>m</em>th PC($Z_m$) used:<br />
  \(PVE_m = \frac{Var(Z_m)}{\sum_{j=1}^p(Var(Z_j))}\)<br />
  (\(\sum_{j=1}^p(Var(Z_j)) = \sum Var(X_j) =\) total variance)</p>
  </li>
  <li>
    <p>in <em>SVD</em> of covariance matrix $X^T X$,<br />
  \(\begin{align*}
  X^T X &amp;= \mathbb{VDU}^T\mathbb{UDV}^T \\
        &amp;= \mathbb{VD^2 V}^T
  \end{align*}\)<br />
  in this eigen decomposition,<br />
  \(\mathbb{V} = (v_1,\ldots,v_p)\) the eigen vectors of $X^T X$<br />
  \(\mathbb{D}^2 = \begin{bmatrix}
                      d_1^2 &amp; \cdots &amp; 0 \\
                      \vdots &amp; \ddots &amp; \vdots \\
                      0 &amp; \cdots &amp; d_p^2
                      \end{bmatrix}\)
                      $d_j^2 = e_j$, <em>j</em>th eigenvalue of $X^T X$<br />
  thus,<br />
  \(\begin{align*}
  Var(Z_m) &amp;= \frac{1}{n}(Z_m^T Z_m) \\
           &amp;= \frac{1}{n}(v_m^T X_s^T X_s v_m) \\
           &amp;= \frac{1}{n}(v_m^T\mathbb{VD}^2\mathbb{V}^T v_m) \\
           &amp;= \frac{1}{n}d_m^2 = \frac{1}{n}e_m
  \end{align*}\)</p>
  </li>
  <li>
    <p>Therefore,<br />
  \(PVE_m = \frac{Var(Z_m)}{\sum_{j=1}^p(Var(Z_j))} = \frac{e_m}{\sum_{j=1}^p e_j}\)<br />
  we can draw a <em>scree plot</em> on the value of $PVE_m$ over the value of <em>m</em> to find optimal “M”.</p>
  </li>
</ul>

<h4 id="the-principal-components-regression-approach">The Principal Components Regression Approach</h4>
<ul>
  <li>
    <p>The key idea is that a small number of principal components can explain most of the variability in the 
  data, as well as the relationship with the response. Under this assumption, fitting a least squares model 
  to $Z_1, \ldots, Z_M$ will lead to better results than fitting a least squares model to $X_1, \ldots, X_p$, 
  since most or all of the information in the data is contained in $Z_m$ and there are smaller number of 
  coefficients, we can mitigate overfitting.</p>
  </li>
  <li>
    <p>Note that PCR is not a feature selection method; each principal component is a linear combination of all <em>p</em> of the original features. 
  In this sense, PCR is more closely related to ridge regression than to the lasso.</p>
  </li>
  <li>
    <p>Deciding “M”:<br />
  full model is \(\hat{Y} = \hat{\theta}_0 + \hat{\theta}_1 Z_1 + \cdots + \hat{\theta}_p Z_p\)<br />
  when $Z_1,\ldots,Z_p$ are computed from the standardized $X_s$, so that \(\hat{\theta}_0 = \bar{y}\),<br />
  as $Z_j$’s are orthogonal, adding variable $Z_{j+1}$ does not affect the coefficients. Thus, $\theta_j$’s are 
  not changed by feature selection; that is,<br />
  \(\hat{Y} = \hat{\theta}_0 + \hat{\theta}_1 Z_1 \\
  \hat{Y} = \hat{\theta}_0 + \hat{\theta}_1 Z_1 + \hat{\theta}_2 Z_2 \\
  \vdots \\
  \hat{Y} = \hat{\theta}_0 + \hat{\theta}_1 Z_1 +\cdots + \hat{\theta}_p Z_p\) the value of $\theta_k$ is the same.<br />
  Then we can use CV methods over these models to get optimal <em>M</em>.</p>
  </li>
</ul>

<h3 id="632-partial-least-squares">6.3.2. Partial Least Squares</h3>
<ul>
  <li>
    <p>The PCR approach identifies linear combinations, or <em>directions</em>, that best represent the predictors. 
  These directions are identified in an <em>unsupervised</em> way, since the response <em>Y</em> is not used to help 
  determine the principal component directions. Therefore, PCR suffers from a drawback: there is no guarantee 
  that the directions that best explain the predictors will also be the best directions to use for 
  predicting the response.</p>
  </li>
  <li>
    <p>PLS is a <em>supervised</em> alternative to PCR; finding PLS directions $Z_1,\ldots,Z_m$ that 
  $Cov(Y,Z_1)\ge Cov(Y,Z_2)\ge\cdots\ge Cov(Y,Z_M)$ instead of $Var(Z_1)\ge\cdots\ge Var(Z_M)$.</p>
  </li>
  <li>
    <p>First PLS direction is computed, after standardizing predictors, by setting each $\phi_{j1}$ equal to 
  the coefficient from the simple linear regression $Y$ onto $X_j$. As $Z_1 = \sum_{j=1}^p\phi_{j1}X_j$, 
  PLS places the highest weight on the variables that are most strongly related to the response.<br />
  To find second PLS direction, we adjust each of the variables for $Z_1$, by regressing each variable 
  on $Z_1$ and taking <em>residuals</em>. The residuals can be interpreted as the remaining information that has 
  not been explained by the first PLS direction. Then we compute $Z_2$ using this orthogonalized data by 
  the same way of computing $Z_1$. This procedure is repeated <em>M</em> times.</p>
  </li>
  <li>
    <p>in Simple Regression case,<br />
  $\hat X_j^s$ is the projection of the original data $X_j^s$ onto the vector $Z_1$; $\hat X_j^s = \alpha Z_1$.<br />
  the residual vector is $r_j = X_j^s - \hat X_j^s$, and $r_j\bot Z_1$.<br />
  Then, $r_j = X_j^{(2)}$ is the orthogonalized data for computing the next $Z_2$.</p>
  </li>
  <li>
    <p>The <em>m</em>th PLS direction:<br />
  \(\text{max}_{\phi} Cov(y,X_s\phi)\) subject to $\lVert\phi\rVert = 1$, $\phi^T S v_l = 0$<br />
  for $\phi$ as orthogonal directions, sample covariance matrix <em>S</em>, and $v_l$ as <em>l</em> th PLS direction.<br />
  \(\text{max}_{\phi}[E(\phi^T X_s^T y)-E(y)E(\phi^T X_s)]\), as standardized, $E(X_s) = 0$,<br />
  \(\equiv \text{max}_{\phi}\phi^T (X_s^T y)\), which is the maximization of the dot product of the two vectors $\phi$ and $X_s^T y$.<br />
  note that the dot product is maximized when the two vectors point in the same direction.<br />
  $\therefore \phi \propto X_s^T y$ (scaled so that $\lVert\phi\rVert = 1$).</p>
  </li>
</ul>

<p><img src="/assets/images/ch6_pls_algorithm_0.png" alt="png" width="80%" height="80%" /></p>

<h2 id="64-considerations-in-high-dimensions">6.4. Considerations in High Dimensions</h2>

<h3 id="641-high-dimensional-data">6.4.1. High-Dimensional Data</h3>
<ul>
  <li>Data sets containing more features than observations, $p &gt; n$.</li>
</ul>

<h3 id="642-what-goes-wrong-in-high-dimensions">6.4.2. What Goes Wrong in High Dimensions?</h3>
<ul>
  <li>Standard least squares cannot be performed. Regardless of the true relationship between features and response, 
  least squares will result in a perfect fit to the data, leading to overfitting of the data and poor 
  predictions.</li>
</ul>

<h3 id="643-regression-in-high-dimensions">6.4.3. Regression in High Dimensions</h3>
<ul>
  <li>new technologies that allow for the collection of measurements for thousands or millions of features 
  are a double-edged sword: they can lead to improved predictive models if these features are in fact 
  relevant to the problem at hand, but will lead to worse results if the features are not relevant. 
  Even if they are relevant, the variance incurred in fitting their coefficients may outweigh the 
  reduction in bias that they bring.</li>
</ul>

<h3 id="644-interpreting-results-in-high-dimensions">6.4.4. Interpreting Results in High Dimensions</h3>
<ul>
  <li>
    <p>In high-dimensional setting, the multicollinearity problem is extreme:<br />
  any variable in the model is a linear combination of all of the other variables in the model. This means 
  we can never know exactly which variables truly are predictive of the outcome, and we can never identify 
  the best coefficients for use in the regression.</p>
  </li>
  <li>
    <p>When $p &gt; n$, it is easy to obtain a useless model that has zero residuals. Therefore, we should never 
  use sum of squared errors, p-values, $R^2$ statistics, or other traditional measures of model fit on the 
  training data as evidence of a good model fit. Instead we report results on an independent test set, or 
  cross-validation errors.</p>
  </li>
</ul>

                </div>
            </section>

            <!-- Email subscribe form at the bottom of the page -->
	<!--
            
                <section class="subscribe-form">
                    <h3 class="subscribe-form-title">Subscribe to Darron's Devlog</h3>
                    <p>Get the latest posts delivered right to your inbox</p>
                    <span id="searchform" method="post" action="/subscribe/" class="">
    <input class="confirm" type="hidden" name="confirm"  />
    <input class="location" type="hidden" name="location"  />
    <input class="referrer" type="hidden" name="referrer"  />

    <div class="form-group">
        <input class="subscribe-email" onkeyup="myFunc()" 
               id="searchtext" type="text" name="searchtext"  
               placeholder="Search..." />
    </div>
    <script type="text/javascript">
        function myFunc() {
            if(event.keyCode == 13) {
                var url = encodeURIComponent($("#searchtext").val());
                location.href = "/search.html?query=" + url;
            }
        }
    </script>
</span>
                </section>
            
	-->
            <footer class="post-full-footer">
                <!-- Everything inside the #author tags pulls data from the author -->
                <!-- #author-->
                
                    
                <!-- /author  -->
            </footer>

            <!-- If you use Disqus comments, just uncomment this block.
            The only thing you need to change is "test-apkdzgmqhj" - which
            should be replaced with your own Disqus site-id. -->
            
                <section class="post-full-comments">
                    <div id="disqus_thread"></div>
                    <script>
                        // Page-level settings intended for the Disqus comment embed.
                        // NOTE(review): the three values below are plain local variables, so
                        // nothing outside this function can read them. Disqus's documented
                        // pattern assigns this.page.url / this.page.identifier / this.page.title
                        // instead -- confirm against the Disqus docs before changing.
                        // NOTE(review): this_page_url is a 0.0.0.0:4000 address, while the share
                        // links on this page point at https://12kdh43.github.io -- presumably a
                        // leftover from a local build; verify the site URL setting.
                        var disqus_config = function () {
                            var this_page_url = 'http://0.0.0.0:4000/islr_ch6';
                            var this_page_identifier = '/islr_ch6';
                            var this_page_title = 'ISLR - Chapter 6. Linear Model Selection and Regularization';
                        };
                        // Inject the Disqus loader: create a <script> element, stamp it with the
                        // current time (numeric milliseconds via unary +), and append it to
                        // <head>, or to <body> when no head is available.
                        (function() {
                            var d = document, s = d.createElement('script');
                            // NOTE(review): the host has an empty subdomain ('https://.disqus.com'),
                            // presumably because the Disqus shortname was never configured; this
                            // URL cannot resolve until the shortname is filled in.
                            s.src = 'https://.disqus.com/embed.js';
                            s.setAttribute('data-timestamp', +new Date());
                            (d.head || d.body).appendChild(s);
                        })();
                    </script>
                </section>
            

        </article>

    </div>
</main>

<!-- Links to Previous/Next posts -->
<aside class="read-next outer">
    <div class="inner">
        <div class="read-next-feed">
            
                
                    <article class="read-next-card"
                        
                            style="background-image: url(/assets/built/images/blog-cover1.png)"
                        
                    >
                        <header class="read-next-card-header">
                            <small class="read-next-card-header-sitetitle">&mdash; Darron's Devlog &mdash;</small>
                            
                                <h3 class="read-next-card-header-title"><a href="/tag/islr/">Islr</a></h3>
                            
                        </header>
                        <div class="read-next-divider"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 14.5s2 3 5 3 5.5-2.463 5.5-5.5S21 6.5 18 6.5c-5 0-7 11-12 11C2.962 17.5.5 15.037.5 12S3 6.5 6 6.5s4.5 3.5 4.5 3.5"/></svg>
</div>
                        <div class="read-next-card-content">
                            <ul>
                                
                                
                                            <li><a href="/islr_ch10">ISLR - Chapter 10. Deep Learning</a></li>
                                        
                                    
                                            <li><a href="/islr_ch9">ISLR - Chapter 9. Support Vector Machines</a></li>
                                        
                                    
                                            <li><a href="/islr_ch8">ISLR - Chapter 8. Tree-Based Methods</a></li>
                                        
                                    
                            </ul>
                        </div>
                        <footer class="read-next-card-footer">
                            <a href="/tag/islr/">
                                
                                    See all 8 posts  →
                                
                            </a>
                        </footer>
                    </article>
                
            
            <!-- If there's a next post, display it using the same markup included from - partials/post-card.hbs -->
            
                    <article class="post-card post-template no-image">
        
        <div class="post-card-content">
            <a class="post-card-content-link" href="/islr_ch7">
                <header class="post-card-header">
                    
                        
                                <span class="post-card-tags">Islr</span>
                            
                        
                    <h2 class="post-card-title">ISLR - Chapter 7. Moving Beyond Linearity</h2>
                </header>
                <section class="post-card-excerpt">
                    
                        <p>Chapter 7. Moving Beyond Linearity 7.1. Polynomial Regression 7.2. Step Functions 7.3. Basis Functions 7.4. Regression Splines 7.4.1. Piecewise Polynomials 7.4.2. Constraints and Splines 7.4.3. The Spline Basis Representation 7.4.4. Choosing the Number</p>
                    
                </section>
            </a>
            <footer class="post-card-meta">
                
                    
            </footer>
        </div>
    </article>

            
            <!-- If there's a previous post, display it using the same markup included from - partials/post-card.hbs -->
            
                    <article class="post-card post-template no-image">
        
        <div class="post-card-content">
            <a class="post-card-content-link" href="/islr_ch5">
                <header class="post-card-header">
                    
                        
                                <span class="post-card-tags">Islr</span>
                            
                        
                    <h2 class="post-card-title">ISLR - Chapter 5. Resampling Methods</h2>
                </header>
                <section class="post-card-excerpt">
                    
                        <p>Chapter 5. Resampling Methods 5.1. Cross-Validation 5.1.1. The Validation Set Approach 5.1.2. Leave-One-Out Cross-Validation 5.1.3. k-Fold Cross-Validation 5.1.4. Bias-Variance Trade-Off for k-Fold Cross-Validation 5.1.5. Cross-Validation on Classification Problems 5.2. The Bootstrap Chapter 5.</p>
                    
                </section>
            </a>
            <footer class="post-card-meta">
                
                    
            </footer>
        </div>
    </article>

            
        </div>
    </div>
</aside>

<!-- Floating header which appears on-scroll, included from includes/floating-header.hbs -->
<div class="floating-header">
    <div class="floating-header-logo">
        <a href="/">
            
            <span>Darron's Devlog</span>
        </a>
    </div>
    <span class="floating-header-divider">&mdash;</span>
    <div class="floating-header-title">ISLR - Chapter 6. Linear Model Selection and Regularization</div>
    <div class="floating-header-share">
        <div class="floating-header-share-label">Share this <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
    <path d="M7.5 15.5V4a1.5 1.5 0 1 1 3 0v4.5h2a1 1 0 0 1 1 1h2a1 1 0 0 1 1 1H18a1.5 1.5 0 0 1 1.5 1.5v3.099c0 .929-.13 1.854-.385 2.748L17.5 23.5h-9c-1.5-2-5.417-8.673-5.417-8.673a1.2 1.2 0 0 1 1.76-1.605L7.5 15.5zm6-6v2m-3-3.5v3.5m6-1v2"/>
</svg>
</div>
        <a class="floating-header-share-tw" href="https://twitter.com/share?text=ISLR+-+Chapter+6.+Linear+Model+Selection+and+Regularization&amp;url=https://12kdh43.github.io/islr_ch6"
            onclick="window.open(this.href, 'share-twitter', 'width=550,height=235');return false;">
            <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32"><path d="M30.063 7.313c-.813 1.125-1.75 2.125-2.875 2.938v.75c0 1.563-.188 3.125-.688 4.625a15.088 15.088 0 0 1-2.063 4.438c-.875 1.438-2 2.688-3.25 3.813a15.015 15.015 0 0 1-4.625 2.563c-1.813.688-3.75 1-5.75 1-3.25 0-6.188-.875-8.875-2.625.438.063.875.125 1.375.125 2.688 0 5.063-.875 7.188-2.5-1.25 0-2.375-.375-3.375-1.125s-1.688-1.688-2.063-2.875c.438.063.813.125 1.125.125.5 0 1-.063 1.5-.25-1.313-.25-2.438-.938-3.313-1.938a5.673 5.673 0 0 1-1.313-3.688v-.063c.813.438 1.688.688 2.625.688a5.228 5.228 0 0 1-1.875-2c-.5-.875-.688-1.813-.688-2.75 0-1.063.25-2.063.75-2.938 1.438 1.75 3.188 3.188 5.25 4.25s4.313 1.688 6.688 1.813a5.579 5.579 0 0 1 1.5-5.438c1.125-1.125 2.5-1.688 4.125-1.688s3.063.625 4.188 1.813a11.48 11.48 0 0 0 3.688-1.375c-.438 1.375-1.313 2.438-2.563 3.188 1.125-.125 2.188-.438 3.313-.875z"/></svg>

        </a>
        <a class="floating-header-share-fb" href="https://www.facebook.com/sharer/sharer.php?u=https://12kdh43.github.io/islr_ch6"
            onclick="window.open(this.href, 'share-facebook','width=580,height=296');return false;">
            <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32"><path d="M19 6h5V0h-5c-3.86 0-7 3.14-7 7v3H8v6h4v16h6V16h5l1-6h-6V7c0-.542.458-1 1-1z"/></svg>

        </a>
    </div>
    <progress class="progress" value="0">
        <div class="progress-container">
            <span class="progress-bar"></span>
        </div>
    </progress>
</div>


<!-- /post -->

<!-- The #contentFor helper here will send everything inside it up to the matching #block helper found in default.hbs -->


        <!-- Previous/next page links - displayed on every page -->
        

        <!-- The footer at the very bottom of the screen -->
        <footer class="site-footer outer">
            <div class="site-footer-content inner">
                <section class="copyright"><a href="/">Darron's Devlog</a> &copy; 2022</section>
                <!-- 
				<section class="poweredby">Proudly published with <a href="https://jekyllrb.com/">Jekyll</a> &
                    <a href="https://pages.github.com/" target="_blank" rel="noopener">GitHub Pages</a> using
                    <a href="https://github.com/jekyllt/jasper2" target="_blank" rel="noopener">Jasper2</a></section>
                -->
				<nav class="site-footer-nav">
                    <a href="/">Latest Posts</a>
                    
                    
                </nav>
            </div>
        </footer>

    </div>

    <!-- The big email subscribe modal content -->
    
        <div id="subscribe" class="subscribe-overlay">
            <a class="subscribe-overlay-close" href="#"></a>
            <div class="subscribe-overlay-content">
                
                <h1 class="subscribe-overlay-title">Search Darron's Devlog</h1>
                <p class="subscribe-overlay-description">
				</p>
                <span id="searchform" method="post" action="/subscribe/" class="">
    <input class="confirm" type="hidden" name="confirm"  />
    <input class="location" type="hidden" name="location"  />
    <input class="referrer" type="hidden" name="referrer"  />

    <div class="form-group">
        <input class="subscribe-email" onkeyup="myFunc()" 
               id="searchtext" type="text" name="searchtext"  
               placeholder="Search..." />
    </div>
    <script type="text/javascript">
        /**
         * Keyup handler for the search box: when Enter is released, navigate to
         * the search page with the typed text as the URL-encoded `query` value.
         *
         * @param {KeyboardEvent} [e] The keyup event. Optional so that the
         *     existing inline `onkeyup="myFunc()"` attribute keeps working; when
         *     omitted, the legacy global `window.event` is used instead.
         */
        function myFunc(e) {
            e = e || window.event;
            if (!e) {
                // No event object available (no argument and no window.event):
                // nothing to inspect, so do nothing rather than throw.
                return;
            }
            // Prefer the standard `key` property; keep the deprecated `keyCode`
            // comparison as a fallback for browsers that do not set `key`.
            if (e.key === "Enter" || e.keyCode === 13) {
                var url = encodeURIComponent($("#searchtext").val());
                location.href = "/search.html?query=" + url;
            }
        }
    </script>
</span>
            </div>
        </div>
    

    <!-- highlight.js -->
    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
    <script>
      // Apply highlight.js to every <pre><code> block once the DOM is parsed.
      // The library version matches the highlight.js 9.12.0 stylesheet linked
      // in <head>. Plain DOM APIs are used because jQuery is only loaded by a
      // later <script> tag, so `$` is not available when this code runs.
      document.addEventListener('DOMContentLoaded', function () {
        if (!window.hljs) {
          // Library failed to load (e.g. CDN blocked): leave code blocks as-is.
          return;
        }
        var codeBlocks = document.querySelectorAll('pre code');
        for (var i = 0; i < codeBlocks.length; i++) {
          hljs.highlightBlock(codeBlocks[i]);
        }
      });
    </script>

    <!-- jQuery + Fitvids, which makes all video embeds responsive -->
    <script
        src="https://code.jquery.com/jquery-3.2.1.min.js"
        integrity="sha256-hwg4gsxgFZhOsEEamdOYGBf13FyQuiTwlAQgxVSNgt4="
        crossorigin="anonymous">
    </script>
    <script type="text/javascript" src="/assets/js/jquery.fitvids.js"></script>
    <script type="text/javascript" src="https://demo.ghost.io/assets/js/jquery.fitvids.js?v=724281a32e"></script>


    <!-- Paginator increased to "infinit" in _config.yml -->
    <!-- if paginator.posts  -->
    <!-- <script>
        var maxPages = parseInt('');
    </script>
    <script src="/assets/js/infinitescroll.js"></script> -->
    <!-- /endif -->

    
    <!-- Add Google Analytics  -->
    <!-- Google Analytics Tracking code -->
 <script>
  // Google Analytics (analytics.js) loader, kept in its minified form.
  // It records the name 'ga' on window.GoogleAnalyticsObject and, if window.ga
  // is not already defined, installs a stub that pushes each call's arguments
  // onto ga.q. It stores the current time in ga.l, then creates an async
  // <script> element pointing at analytics.js and inserts it before the first
  // <script> element in the document.
  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

  // NOTE(review): the tracking ID passed to 'create' is an empty string,
  // presumably because no ID was set in the site configuration -- confirm
  // whether analytics is meant to be enabled.
  ga('create', '', 'auto');
  ga('send', 'pageview');

 </script>

	
    <!-- The #block helper will pull in data from the #contentFor other template files. In this case, there's some JavaScript which we only want to use in post.hbs, but it needs to be included down here, after jQuery has already loaded. -->
    
        <script>

// NOTE: Scroll performance is poor in Safari
// - this appears to be due to the events firing much more slowly in Safari.
//   Dropping the scroll event and using only a raf loop results in smoother
//   scrolling but continuous processing even when not scrolling
$(document).ready(function () {
    // Make video embeds inside the post body responsive (fitVids plugin).
    $(".post-full-content").fitVids();

    // Elements driven by the scroll position.
    var progressEl = document.querySelector('progress');
    var floatingHeader = document.querySelector('.floating-header');
    var postTitle = document.querySelector('.post-full-title');

    // Measurements cached by the event listeners and consumed by render().
    var scrollTop = window.scrollY;
    var viewportHeight = window.innerHeight;
    var pageHeight = $(document).height();

    // True while a requestAnimationFrame callback is already queued.
    var framePending = false;

    // Repaint the floating header and the reading-progress bar from the
    // cached measurements, then allow the next frame to be scheduled.
    var render = function () {
        var titleTop = postTitle.getBoundingClientRect().top + window.scrollY;
        var titleExtent = postTitle.offsetHeight + 35;
        var scrollableDistance = pageHeight - viewportHeight;

        // The floating header is active once the page has been scrolled to
        // the title's top plus its height plus 35px.
        var pastTitle = scrollTop >= titleTop + titleExtent;
        floatingHeader.classList[pastTitle ? 'add' : 'remove']('floating-active');

        progressEl.setAttribute('max', scrollableDistance);
        progressEl.setAttribute('value', scrollTop);

        framePending = false;
    };

    // Queue at most one repaint per animation frame.
    var scheduleRender = function () {
        if (!framePending) {
            requestAnimationFrame(render);
        }
        framePending = true;
    };

    window.addEventListener('scroll', function () {
        scrollTop = window.scrollY;
        scheduleRender();
    }, {passive: true});

    window.addEventListener('resize', function () {
        viewportHeight = window.innerHeight;
        pageHeight = $(document).height();
        scheduleRender();
    }, false);

    // Initial paint so the header and progress bar are correct before any event fires.
    render();
});
</script>

    
    <!-- Ghost outputs important scripts and data with this tag - it should always be the very last thing before the closing body tag -->
    <!-- ghost_foot -->

</body>
</html>