cs231n_lec8.html

<!DOCTYPE html>
<html lang="en">
<head>
    <!-- lang="en" on the root element: the page content is English, so assistive
         technology and translation tools can pick the right language. -->

    <!-- Document Settings -->
    <meta charset="utf-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
	
	<!-- On Post front-matter YAML, set "use_math: true" to use LaTex -->
	
	  
<!-- MathJax v2 configuration. This block is picked up by MathJax.js (loaded below)
     when it starts, so it has to appear in the page alongside the loader. -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
    TeX: {
        // Automatic equation numbering following the AMS convention.
        equationNumbers: {
            autoNumber: "AMS"
        }
    },
    tex2jax: {
        // Delimiters recognised as inline / display math in the page text.
        inlineMath: [ ['$', '$'], ["\\(", "\\)"] ],
        displayMath: [ ['$$', '$$'], ["\\[", "\\]"] ],
        processEscapes: true
    }
});
// Report rendering problems to the developer console instead of a blocking
// alert() so that a single broken formula does not interrupt readers.
// Each hook is labelled with the signal it actually listens to.
MathJax.Hub.Register.MessageHook("Math Processing Error", function (message) {
    console.error("Math Processing Error: " + message[1]);
});
MathJax.Hub.Register.MessageHook("TeX Jax - parse error", function (message) {
    console.error("TeX parse error: " + message[1]);
});
</script>

<script type="text/javascript" async
    src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
	

    <!-- Base Meta -->
    <!-- dynamically fixing the title for tag/author pages -->


    <title>cs231n - Lecture 8. Training Neural Networks II</title>
    <meta name="HandheldFriendly" content="True" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <!-- Styles'n'Scripts -->
    <link rel="stylesheet" type="text/css" href="/assets/built/screen.css" />
    <link rel="stylesheet" type="text/css" href="/assets/built/screen.edited.css" />

    <!-- syntax.css (linked once; it was previously included twice) -->
    <link rel="stylesheet" type="text/css" href="/assets/built/syntax.css" />
	
    <!-- highlight.js (explicit https: instead of a protocol-relative URL) -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/default.min.css">
    <style>.hljs { background: none; }</style>

    <!--[if IE]>
        <style>
            p, ol, ul{
                width: 100%;
            }
            blockquote{
                width: 100%;
            }
        </style>
    <![endif]-->
    
    <!-- This tag outputs SEO meta+structured data and other important settings -->
    <!-- Non-empty description summarising the post; the favicon is a .jpg file, so its MIME type is image/jpeg. -->
    <meta name="description" content="Notes on cs231n Lecture 8: optimizers (SGD with momentum, AdaGrad, RMSProp, Adam), learning rate schedules, regularization, and choosing hyperparameters." />
    <link rel="shortcut icon" href="http://0.0.0.0:4000/assets/built/images/favicon.jpg" type="image/jpeg" />
    <link rel="canonical" href="http://0.0.0.0:4000/cs231n_lec8" />
    <meta name="referrer" content="no-referrer-when-downgrade" />

     <!--title below is coming from _includes/dynamic_title-->
    <meta property="og:site_name" content="Darron's Devlog" />
    <meta property="og:type" content="website" />
    <meta property="og:title" content="cs231n - Lecture 8. Training Neural Networks II" />
    <meta property="og:description" content="Optimization Problems with SGD What if loss changes quickly in one direction and slowly in another? What does gradient descent do? Very slow progress along shallow dimension, jitter along steep direction What if the loss function has a local minima or saddle point? Zero gradient, gradient descent gets stuck(more common" />
    <meta property="og:url" content="http://0.0.0.0:4000/cs231n_lec8" />
    <meta property="og:image" content="http://0.0.0.0:4000/assets/built/images/blog-cover1.png" />
    <meta property="article:publisher" content="https://www.facebook.com/" />
    <meta property="article:author" content="https://www.facebook.com/" />
    <meta property="article:published_time" content="2021-12-27T15:00:00+00:00" />
    <meta property="article:modified_time" content="2021-12-27T15:00:00+00:00" />
    <meta property="article:tag" content="cs231n" />
    <meta name="twitter:card" content="summary_large_image" />
    <meta name="twitter:title" content="cs231n - Lecture 8. Training Neural Networks II" />
    <meta name="twitter:description" content="Optimization Problems with SGD What if loss changes quickly in one direction and slowly in another? What does gradient descent do? Very slow progress along shallow dimension, jitter along steep direction What if the loss function has a local minima or saddle point? Zero gradient, gradient descent gets stuck(more common" />
    <meta name="twitter:url" content="http://0.0.0.0:4000/" />
    <meta name="twitter:image" content="http://0.0.0.0:4000/assets/built/images/blog-cover1.png" />
    <meta name="twitter:label1" content="Written by" />
    <meta name="twitter:data1" content="Darron's Devlog" />
    <meta name="twitter:label2" content="Filed under" />
    <meta name="twitter:data2" content="cs231n" />
    <meta name="twitter:site" content="@" />
    <meta name="twitter:creator" content="@" />
    <meta property="og:image:width" content="1400" />
    <meta property="og:image:height" content="933" />

    <!-- schema.org structured data for this page. The type name is case-sensitive:
         the schema.org type is "WebSite", not "Website". -->
    <script type="application/ld+json">
{
    "@context": "https://schema.org",
    "@type": "WebSite",
    "publisher": {
        "@type": "Organization",
        "name": "Darron's Devlog",
        "logo": "http://0.0.0.0:4000/"
    },
    "url": "http://0.0.0.0:4000/cs231n_lec8",
    "image": {
        "@type": "ImageObject",
        "url": "http://0.0.0.0:4000/assets/built/images/blog-cover1.png",
        "width": 2000,
        "height": 666
    },
    "mainEntityOfPage": {
        "@type": "WebPage",
        "@id": "http://0.0.0.0:4000/cs231n_lec8"
    },
    "description": "Optimization Problems with SGD What if loss changes quickly in one direction and slowly in another? What does gradient descent do? Very slow progress along shallow dimension, jitter along steep direction What if the loss function has a local minima or saddle point? Zero gradient, gradient descent gets stuck(more common"
}
    </script>

    <!-- <script type="text/javascript" src="https://demo.ghost.io/public/ghost-sdk.min.js?v=724281a32e"></script>
    <script type="text/javascript">
    ghost.init({
    	clientId: "ghost-frontend",
    	clientSecret: "f84a07a72b17"
    });
    </script> -->

    <meta name="generator" content="Jekyll 3.6.2" />
    <link rel="alternate" type="application/rss+xml" title="cs231n - Lecture 8. Training Neural Networks II" href="/feed.xml" />


</head>
<body class="post-template">

    <div class="site-wrapper">
        <!-- All the main content gets inserted here, index.hbs, post.hbs, etc -->
        <!-- default -->

<!-- The tag above means: insert everything in this file
into the {body} of the default.hbs template -->

<header class="site-header outer">
    <div class="inner">
        <nav class="site-nav">
    <div class="site-nav-left">
        <a class="site-nav-logo" href="/">Darron's Devlog</a>
        <!-- Plain list of navigation links. role="menu"/"menuitem" were removed:
             those roles announce an application-style menu with its own keyboard
             pattern, which this list of ordinary links does not implement. -->
        <ul class="nav">
            <li class="nav-home"><a href="/">Home</a></li>
            <li class="nav-about"><a href="/about/">About</a></li>
            <li class="nav-projects"><a href="/tag/projects/">Projects</a></li>
            <li class="nav-studies"><a href="/tag/studies/">Studies</a></li>
            <li class="nav-blog"><a href="/tag/blog/">Blog</a></li>
            <li class="nav-archive">
                <a href="/archive.html">All Posts</a>
            </li>
        </ul>
    </div>
    <div class="site-nav-right">
        <div class="social-links">
        </div>
        <a class="subscribe-button" href="#subscribe">Search</a>
    </div>
</nav>

    </div>
</header>

<!-- Everything inside the #post tags pulls data from the post -->
<!-- #post -->

<main id="site-main" class="site-main outer" role="main">
    <div class="inner">

        <article class="post-full  tag-cs231n  no-image">

            <header class="post-full-header">
                <section class="post-full-meta">
                    <!-- datetime carries the machine-readable date (YYYY-MM-DD); the visible text stays human-friendly. -->
                    <time class="post-full-meta-date" datetime="2021-12-27">27 December 2021</time>
                    <span class="date-divider">/</span>
                    <a href="/tag/cs231n/">CS231N</a>
                </section>
                <h1 class="post-full-title">cs231n - Lecture 8. Training Neural Networks II</h1>
            </header>
	<!--
            
	-->
            <section class="post-full-content">
                <div class="kg-card-markdown">
                    <h2 id="optimization">Optimization</h2>

<h3 id="problems-with-sgd">Problems with SGD</h3>
<ol>
  <li>
    <p>What if loss changes quickly in one direction and slowly in another? What does gradient descent do?<br />
 Very slow progress along shallow dimension, jitter along steep direction</p>
  </li>
  <li>
    <p>What if the loss function has a local minimum or saddle point?<br />
 Zero gradient, gradient descent gets stuck (saddle points are more common in high dimensions)</p>
  </li>
  <li>
    <p>Gradients computed from minibatches can be noisy</p>
  </li>
</ol>

<h3 id="sgd--momentum">SGD + Momentum</h3>
<ul>
  <li>To avoid local minima, combine gradient at current point with <em>velocity</em> to get step used to update weights; continue moving in the same general direction as the previous iterations<br />
  \(v_{t+1}=\rho v_t + \nabla f(x_t)\)<br />
  \(x_{t+1}=x_t - \alpha v_{t+1}\)<br />
  with <em>rho</em> giving “friction”; typically <em>0.9</em> or <em>0.99</em></li>
</ul>

<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">vx</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">while</span> <span class="bp">True</span><span class="p">:</span>
	<span class="n">dx</span> <span class="o">=</span> <span class="n">compute_gradient</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
	<span class="n">vx</span> <span class="o">=</span> <span class="n">rho</span> <span class="o">*</span> <span class="n">vx</span> <span class="o">+</span> <span class="n">dx</span>
	<span class="n">x</span> <span class="o">-=</span> <span class="n">learning_rate</span> <span class="o">*</span> <span class="n">vx</span>
</code></pre></div></div>

<h3 id="nesterov-momentum">Nesterov Momentum</h3>
<ul>
  <li>“Look ahead” to the point where updating using velocity would take us; compute gradient there and mix it with velocity to get actual update direction<br />
  \(v_{t+1}=\rho v_t - \alpha\nabla f(x_t + \rho v_t)\)<br />
  \(x_{t+1}=x_t + v_{t+1}\)<br />
  rearrange with \(\tilde{x}_t = x_t + \rho v_t\),<br />
  \(v_{t+1}=\rho v_t - \alpha\nabla f(\tilde{x}_t)\)<br />
  \(\begin{align*}
  \tilde{x}_{t+1} &amp;= \tilde{x}_t - \rho v_t + (1+\rho)v_{t+1} \\
                  &amp;= \tilde{x}_t + v_{t+1} + \rho(v_{t+1}-v_t)
  \end{align*}\)</li>
</ul>

<h3 id="adagrad">AdaGrad</h3>
<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">grad_squared</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">while</span> <span class="bp">True</span><span class="p">:</span>
	<span class="n">dx</span> <span class="o">=</span> <span class="n">compute_gradient</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
	<span class="n">grad_squared</span> <span class="o">+=</span> <span class="n">dx</span> <span class="o">*</span> <span class="n">dx</span>
	<span class="n">x</span> <span class="o">-=</span> <span class="n">learning_rate</span> <span class="o">*</span> <span class="n">dx</span> <span class="o">/</span> <span class="p">(</span><span class="n">np</span><span class="p">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">grad_squared</span><span class="p">)</span> <span class="o">+</span> <span class="mf">1e-7</span><span class="p">)</span>
</code></pre></div></div>

<ul>
  <li>Added element-wise scaling of the gradient based on the historical sum of squares in each dimension<br />
  “Per-parameter learning rates” or “adaptive learning rates”<br />
  Progress along “steep” directions is damped and progress along “flat” directions is accelerated<br />
  Step size decays to zero over time</li>
</ul>

<h3 id="rmsprop-leaky-adagrad">RMSProp: “Leaky AdaGrad”</h3>
<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">grad_squared</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">while</span> <span class="bp">True</span><span class="p">:</span>
	<span class="n">dx</span> <span class="o">=</span> <span class="n">compute_gradient</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
	<span class="n">grad_squared</span> <span class="o">=</span> <span class="n">decay_rate</span> <span class="o">*</span> <span class="n">grad_squared</span> <span class="o">+</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">decay_rate</span><span class="p">)</span> <span class="o">*</span> <span class="n">dx</span> <span class="o">*</span> <span class="n">dx</span>
	<span class="n">x</span> <span class="o">-=</span> <span class="n">learning_rate</span> <span class="o">*</span> <span class="n">dx</span> <span class="o">/</span> <span class="p">(</span><span class="n">np</span><span class="p">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">grad_squared</span><span class="p">)</span> <span class="o">+</span> <span class="mf">1e-7</span><span class="p">)</span>
</code></pre></div></div>

<h3 id="adam">Adam</h3>
<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">first_moment</span> <span class="o">=</span> <span class="mi">0</span>
<span class="n">second_moment</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">num_iterations</span><span class="p">):</span>
	<span class="n">dx</span> <span class="o">=</span> <span class="n">compute_gradient</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
	<span class="n">first_moment</span> <span class="o">=</span> <span class="n">beta1</span> <span class="o">*</span> <span class="n">first_moment</span> <span class="o">+</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">beta1</span><span class="p">)</span> <span class="o">*</span> <span class="n">dx</span>		<span class="c1"># Momentum
</span>	<span class="n">second_moment</span> <span class="o">=</span> <span class="n">beta2</span> <span class="o">*</span> <span class="n">second_moment</span> <span class="o">+</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">beta2</span><span class="p">)</span> <span class="o">*</span> <span class="n">dx</span> <span class="o">*</span> <span class="n">dx</span>
	<span class="n">first_unbias</span> <span class="o">=</span> <span class="n">first_moment</span> <span class="o">/</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">beta1</span> <span class="o">**</span> <span class="n">t</span><span class="p">)</span>			<span class="c1"># Bias correction
</span>	<span class="n">second_unbias</span> <span class="o">=</span> <span class="n">second_moment</span> <span class="o">/</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">beta2</span> <span class="o">**</span> <span class="n">t</span><span class="p">)</span>
	<span class="n">x</span> <span class="o">-=</span> <span class="n">learning_rate</span> <span class="o">*</span> <span class="n">first_unbias</span> <span class="o">/</span> <span class="p">(</span><span class="n">np</span><span class="p">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">second_unbias</span><span class="p">)</span> <span class="o">+</span> <span class="mf">1e-7</span><span class="p">)</span>	<span class="c1"># AdaGrad/ RMSProp
</span></code></pre></div></div>

<ul>
  <li>Sort of like RMSProp with momentum<br />
  Bias correction for the fact that first and second moment estimates start at zero<br />
  Adam with <em>beta1 = 0.9, beta2 = 0.999, and learning_rate = 1e-3 or 5e-4</em> is a great starting point</li>
</ul>

<h2 id="learning-rate-schedules">Learning rate schedules</h2>

<h3 id="learning-rate-decays-over-time">Learning rate decays over time</h3>
<ul>
  <li>Reduce learning rate by a certain value at a few fixed points(after some epochs)</li>
</ul>

<h3 id="learning-rate-decay">Learning Rate Decay</h3>
<ul>
  <li>
    <p>Reduce learning rate gradually, e.g.<br />
  Cosine:  $\alpha_t = \frac{1}{2}\alpha_0(1+\mbox{cos}(t\pi / T))$<br />
  Linear: $\alpha_t = \alpha_0(1-t/T)$<br />
  Inverse sqrt: $\alpha_t = \alpha_0 / \sqrt{t}$<br />
  where $\alpha_0$ is the initial learning rate, $\alpha_t$ is the learning rate at epoch <em>t</em>, and <em>T</em> is the total number of epochs</p>
  </li>
  <li>
    <p>Linear Warmup<br />
  High initial learning rates can make loss explode; linearly increasing learning rate from <em>0</em> over the first <em>~5000</em> iterations can prevent this<br />
  Empirical rule of thumb: If you increase the batch size by a factor of <em>N</em>, also scale the initial learning rate by <em>N</em></p>
  </li>
</ul>

<h3 id="first-order-optimization">First-Order Optimization</h3>
<ol>
  <li>Use gradient from linear approximation</li>
  <li>Step to minimize the approximation</li>
</ol>

<h3 id="second-order-optimization">Second-Order Optimization</h3>
<ol>
  <li>Use gradient and <strong>Hessian</strong> to form <strong>quadratic</strong> approximation</li>
  <li>Step to the <strong>minimum</strong> of the (quadratic) approximation</li>
</ol>

<ul>
  <li>But the Hessian has <em>O(N^2)</em> elements and inverting it takes <em>O(N^3)</em>, while <em>N</em> is extremely large
    <ul>
      <li>Quasi-Newton methods (BFGS most popular):<br />
  instead of inverting the Hessian, approximate inverse Hessian with rank 1 updates over time</li>
      <li>L-BFGS (Limited memory BFGS):<br />
  Does not form/store the full inverse Hessian. Usually works very well in full batch, deterministic mode, but does not transfer very well to mini-batch setting. Large-scale, stochastic setting is an active area of research.</li>
    </ul>
  </li>
</ul>

<h3 id="summary">Summary</h3>
<ul>
  <li>Adam is a good default choice in many cases; even with constant learning rate</li>
  <li>SGD+Momentum can outperform Adam but may require more tuning of LR and schedule. Cosine schedule preferred, since it has very few hyperparameters.</li>
  <li>L-BFGS is good if you can afford to do full batch updates.</li>
</ul>

<h2 id="improve-test-error">Improve test error</h2>
<ul>
  <li>Better optimization algorithms help reduce <strong>training</strong> loss, but what we really care about is error on new data - how to reduce the gap?</li>
</ul>

<h3 id="early-stopping-always-do-this">Early Stopping: Always do this</h3>
<ul>
  <li>Stop training the model when accuracy on the validation set decreases. Or train for a long time, but always keep track of the model snapshot that worked best on val.</li>
</ul>

<h3 id="model-ensembles">Model Ensembles</h3>
<ol>
  <li>Train multiple independent models</li>
  <li>At test time average their results<br />
 (Take average of predicted probability distributions, then choose argmax)</li>
</ol>

<h3 id="regularization">Regularization</h3>
<ul>
  <li>
    <p>To improve single-model performance, add terms to loss<br />
  e.g. L1, L2, Elastic net.</p>
  </li>
  <li>
    <p>or, use Dropout:<br />
  In each forward pass, randomly set some neurons to zero. Probability of dropping is a hyperparameter; 0.5 is common.<br />
  It forces the network to have a redundant representation; Prevents co-adaptation of features. Dropout can be interpreted as training a large ensemble of models (that share parameters).</p>
  </li>
</ul>

<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">p</span> <span class="o">=</span> <span class="mf">0.5</span> <span class="c1"># probability of keeping a unit active (higher = less dropout)
</span>
<span class="k">def</span> <span class="nf">train_step</span><span class="p">(</span><span class="n">X</span><span class="p">):</span>		<span class="c1"># drop in train time
</span>	<span class="n">H1</span> <span class="o">=</span> <span class="n">np</span><span class="p">.</span><span class="n">maximum</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">np</span><span class="p">.</span><span class="n">dot</span><span class="p">(</span><span class="n">W1</span><span class="p">,</span> <span class="n">X</span><span class="p">)</span> <span class="o">+</span> <span class="n">b1</span><span class="p">)</span>
	<span class="n">U1</span> <span class="o">=</span> <span class="n">np</span><span class="p">.</span><span class="n">random</span><span class="p">.</span><span class="n">rand</span><span class="p">(</span><span class="o">*</span><span class="n">H1</span><span class="p">.</span><span class="n">shape</span><span class="p">)</span> <span class="o">&lt;</span> <span class="n">p</span>
	<span class="n">H1</span> <span class="o">*=</span> <span class="n">U1</span>
	<span class="n">H2</span> <span class="o">=</span> <span class="n">np</span><span class="p">.</span><span class="n">maximum</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">np</span><span class="p">.</span><span class="n">dot</span><span class="p">(</span><span class="n">W2</span><span class="p">,</span> <span class="n">H1</span><span class="p">)</span> <span class="o">+</span> <span class="n">b2</span><span class="p">)</span>
	<span class="n">U2</span> <span class="o">=</span> <span class="n">np</span><span class="p">.</span><span class="n">random</span><span class="p">.</span><span class="n">rand</span><span class="p">(</span><span class="o">*</span><span class="n">H2</span><span class="p">.</span><span class="n">shape</span><span class="p">)</span> <span class="o">&lt;</span> <span class="n">p</span>
	<span class="n">H2</span> <span class="o">*=</span> <span class="n">U2</span>
	<span class="n">out</span> <span class="o">=</span> <span class="n">np</span><span class="p">.</span><span class="n">dot</span><span class="p">(</span><span class="n">W3</span><span class="p">,</span> <span class="n">H2</span><span class="p">)</span> <span class="o">+</span> <span class="n">b3</span>

	<span class="c1"># backward pass: compute gradients...
</span>	<span class="c1"># perform parameter update...
</span>
<span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="n">X</span><span class="p">):</span>			<span class="c1"># scale at test time
</span>	<span class="n">H1</span> <span class="o">=</span> <span class="n">np</span><span class="p">.</span><span class="n">maximum</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">np</span><span class="p">.</span><span class="n">dot</span><span class="p">(</span><span class="n">W1</span><span class="p">,</span> <span class="n">X</span><span class="p">)</span> <span class="o">+</span> <span class="n">b1</span><span class="p">)</span> <span class="o">*</span> <span class="n">p</span>
	<span class="n">H2</span> <span class="o">=</span> <span class="n">np</span><span class="p">.</span><span class="n">maximum</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">np</span><span class="p">.</span><span class="n">dot</span><span class="p">(</span><span class="n">W2</span><span class="p">,</span> <span class="n">H1</span><span class="p">)</span> <span class="o">+</span> <span class="n">b2</span><span class="p">)</span> <span class="o">*</span> <span class="n">p</span>
	<span class="n">out</span> <span class="o">=</span> <span class="n">np</span><span class="p">.</span><span class="n">dot</span><span class="p">(</span><span class="n">W3</span><span class="p">,</span> <span class="n">H2</span><span class="p">)</span> <span class="o">+</span> <span class="n">b3</span>
</code></pre></div></div>

<ul>
  <li>
    <p>more common: “Inverted dropout”<br />
  <code class="language-plaintext highlighter-rouge">U1 = (np.random.rand(*H1.shape) &lt; p) / p</code> in train time and no scaling in test time</p>
  </li>
  <li>
    <p>A common pattern of regularization<br />
  Training: Add some kind of randomness<br />
  $y = f_W(x,z)$<br />
  Testing: Average out randomness (sometimes approximate)<br />
  \(y = f(x) = E_z[f(x,z)] = \int p(z)f(x,z)\, dz\)</p>
  </li>
  <li>
    <p>Data Augmentation:<br />
  Adds <em>transformed</em> data to train model<br />
  e.g. translation, rotation, stretching, shearing, lens distortions.</p>
  </li>
  <li>
    <p>DropConnect:<br />
  Training: Drop connections between neurons (set weights to 0)<br />
  Testing: Use all the connections</p>
  </li>
  <li>
    <p>Fractional Pooling:<br />
  Training: Use randomized pooling regions<br />
  Testing: Average predictions from several regions</p>
  </li>
  <li>
    <p>Stochastic Depth:<br />
  Training: Skip some layers in the network<br />
  Testing: Use all the layers</p>
  </li>
  <li>
    <p>Cutout:<br />
  Training: Set random image regions to zero<br />
  Testing: Use full image</p>
  </li>
  <li>
    <p>Mixup:<br />
  Training: Train on random blends of images<br />
  Testing: Use original images<br />
  e.g. Randomly blend the pixels of pairs of training images, say 40% cat and 60% dog, and set the target label as cat:0.4 and dog:0.6.</p>
  </li>
  <li>
    <p>Summary<br />
  Consider dropout for large fully-connected layers<br />
  Batch normalization and data augmentation are almost always a good idea<br />
  Try cutout and mixup especially for small classification datasets</p>
  </li>
</ul>

<h2 id="choosing-hyperparameters">Choosing Hyperparameters</h2>
<ul>
  <li>
    <p>Step 1: Check initial loss<br />
  Turn off weight decay, sanity check loss at initialization<br />
  e.g. <em>log(C)</em> for softmax with <em>C</em> classes</p>
  </li>
  <li>
    <p>Step 2: Overfit a small sample<br />
  Try to train to 100% training accuracy on a small sample of training data (~5-10 minibatches); fiddle with architecture, learning rate, weight initialization<br />
  If loss is not going down, LR is too low or initialization is bad. If loss explodes, LR is too high or initialization is bad.</p>
  </li>
  <li>
    <p>Step 3: Find LR that makes loss go down<br />
  Use the architecture from the previous step, use all training data, turn on small weight decay, find a learning rate that makes the loss drop significantly within ~100 iterations.</p>
  </li>
  <li>
    <p>Step 4: Coarse grid, train for ~1-5 epochs<br />
  Choose a few values of learning rate and weight decay around what worked from Step 3, train a few models for ~1-5 epochs.</p>
  </li>
  <li>
    <p>Step 5: Refine grid, train longer<br />
  Pick best models from Step 4, train them for longer (~10-20 epochs) without learning rate decay</p>
  </li>
  <li>
    <p>Step 6: Look at loss and accuracy curves<br />
  If accuracy is still going up, you need to train longer. If validation accuracy goes down and there is a huge train / val gap, it means overfitting; you need to increase regularization or get more data. If there’s no gap between train / val, it means underfitting; train longer or use a bigger model.</p>
  </li>
  <li>
    <p>Look at learning curves<br />
  Losses may be noisy, use a scatter plot and also plot moving average to see trends better. Cross-validation is useful too.</p>
  </li>
  <li>
    <p>Step 7: <strong>GO TO Step 5</strong></p>
  </li>
  <li>
    <p>Hyperparameters to play with:<br />
  network architecture,<br />
  learning rate, its decay schedule, update type,<br />
  regularization (L2/Dropout strength)</p>
  </li>
  <li>
    <p>for Hyper-Parameter Optimization, consider both Random Search and Grid Search</p>
  </li>
</ul>

<h3 id="summary-1">Summary</h3>
<ul>
  <li>Improve your training error:
    <ul>
      <li>Optimizers</li>
      <li>Learning rate schedules</li>
    </ul>
  </li>
  <li>Improve your test error:
    <ul>
      <li>Regularization</li>
      <li>Choosing Hyperparameters</li>
    </ul>
  </li>
</ul>


                </div>
            </section>

            <!-- Email subscribe form at the bottom of the page -->
	<!--
            
                <section class="subscribe-form">
                    <h3 class="subscribe-form-title">Subscribe to Darron's Devlog</h3>
                    <p>Get the latest posts delivered right to your inbox</p>
                    <span id="searchform" method="post" action="/subscribe/" class="">
    <input class="confirm" type="hidden" name="confirm"  />
    <input class="location" type="hidden" name="location"  />
    <input class="referrer" type="hidden" name="referrer"  />

    <div class="form-group">
        <input class="subscribe-email" onkeyup="myFunc()" 
               id="searchtext" type="text" name="searchtext"  
               placeholder="Search..." />
    </div>
    <script type="text/javascript">
        function myFunc() {
            if(event.keyCode == 13) {
                var url = encodeURIComponent($("#searchtext").val());
                location.href = "/search.html?query=" + url;
            }
        }
    </script>
</span>
                </section>
            
	-->
            <footer class="post-full-footer">
                <!-- Everything inside the #author tags pulls data from the author -->
                <!-- #author-->
                
                    
                <!-- /author  -->
            </footer>

            <!-- If you use Disqus comments, just uncomment this block.
            The only thing you need to change is "test-apkdzgmqhj" - which
            should be replaced with your own Disqus site-id. -->
            
                <section class="post-full-comments">
                    <div id="disqus_thread"></div>
                    <script>
                        // Disqus calls disqus_config with `this` bound to its configuration
                        // object and reads the page settings from `this.page`. Plain local
                        // variables would be discarded when the function returns, so the
                        // values are assigned onto `this.page` instead.
                        var disqus_config = function () {
                            this.page.url = 'http://0.0.0.0:4000/cs231n_lec8';
                            this.page.identifier = '/cs231n_lec8';
                            this.page.title = 'cs231n - Lecture 8. Training Neural Networks II';
                        };
                        // Inject the Disqus embed script asynchronously.
                        (function() {
                            var d = document, s = d.createElement('script');
                            // NOTE(review): the Disqus shortname is missing from the host below
                            // ("https://<shortname>.disqus.com/embed.js"), so this request cannot
                            // resolve until the site's shortname is configured. Confirm the value
                            // in the site configuration; it cannot be determined from this file.
                            s.src = 'https://.disqus.com/embed.js';
                            s.setAttribute('data-timestamp', +new Date());
                            (d.head || d.body).appendChild(s);
                        })();
                    </script>
                </section>
            

        </article>

    </div>
</main>

<!-- Links to Previous/Next posts -->
<aside class="read-next outer">
    <div class="inner">
        <div class="read-next-feed">
            
                
                    <article class="read-next-card"
                        
                            style="background-image: url(/assets/built/images/blog-cover1.png)"
                        
                    >
                        <header class="read-next-card-header">
                            <small class="read-next-card-header-sitetitle">&mdash; Darron's Devlog &mdash;</small>
                            
                                <h3 class="read-next-card-header-title"><a href="/tag/cs231n/">Cs231n</a></h3>
                            
                        </header>
                        <div class="read-next-divider"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 14.5s2 3 5 3 5.5-2.463 5.5-5.5S21 6.5 18 6.5c-5 0-7 11-12 11C2.962 17.5.5 15.037.5 12S3 6.5 6 6.5s4.5 3.5 4.5 3.5"/></svg>
</div>
                        <div class="read-next-card-content">
                            <ul>
                                
                                
                                            <li><a href="/cs231n_lec15">cs231n - Lecture 15. Detection and Segmentation</a></li>
                                        
                                    
                                            <li><a href="/cs231n_lec14">cs231n - Lecture 14. Visualizing and Understanding</a></li>
                                        
                                    
                                            <li><a href="/cs231n_lec13">cs231n - Lecture 13. Self-Supervised Learning</a></li>
                                        
                                    
                            </ul>
                        </div>
                        <footer class="read-next-card-footer">
                            <a href="/tag/cs231n/">
                                
                                    See all 13 posts  →
                                
                            </a>
                        </footer>
                    </article>
                
            
            <!-- If there's a next post, display it using the same markup included from - partials/post-card.hbs -->
            
                    <article class="post-card post-template no-image">
        
        <div class="post-card-content">
            <a class="post-card-content-link" href="/cs231n_lec9">
                <header class="post-card-header">
                    
                        
                                <span class="post-card-tags">Cs231n</span>
                            
                        
                    <h2 class="post-card-title">cs231n - Lecture 9. CNN Architectures</h2>
                </header>
                <section class="post-card-excerpt">
                    
                        <p>Review LeCun et al., 1998 $5\times 5$ Conv filters applied at stride 1 $2\times 2$ Subsampling (Pooling) layers applied at stride 2 i.e. architecture is [CONV-POOL-CONV-POOL-FC-FC] Stride: Downsample output activations Padding: Preserve input</p>
                    
                </section>
            </a>
            <footer class="post-card-meta">
                
                    
            </footer>
        </div>
    </article>

            
            <!-- If there's a previous post, display it using the same markup included from - partials/post-card.hbs -->
            
                    <article class="post-card post-template no-image">
        
        <div class="post-card-content">
            <a class="post-card-content-link" href="/cs231n_lec7">
                <header class="post-card-header">
                    
                        
                                <span class="post-card-tags">Cs231n</span>
                            
                        
                    <h2 class="post-card-title">cs231n - Lecture 7. Training Neural Networks I</h2>
                </header>
                <section class="post-card-excerpt">
                    
                        <p>Activation Functions Sigmoid $\sigma(x)=1/(1+e^{-x})$ Squashes numbers to range [0,1] Historically popular since they have nice interpretation as a saturating “firing rate” of a neuron. Problem: Gradient Vanishing: Saturated neurons “kill” the gradients; If</p>
                    
                </section>
            </a>
            <footer class="post-card-meta">
                
                    
            </footer>
        </div>
    </article>

            
        </div>
    </div>
</aside>

<!-- Floating header which appears on-scroll, included from includes/floating-header.hbs -->
<div class="floating-header">
    <div class="floating-header-logo">
        <a href="/">
            
            <span>Darron's Devlog</span>
        </a>
    </div>
    <span class="floating-header-divider">&mdash;</span>
    <div class="floating-header-title">cs231n - Lecture 8. Training Neural Networks II</div>
    <div class="floating-header-share">
        <div class="floating-header-share-label">Share this <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
    <path d="M7.5 15.5V4a1.5 1.5 0 1 1 3 0v4.5h2a1 1 0 0 1 1 1h2a1 1 0 0 1 1 1H18a1.5 1.5 0 0 1 1.5 1.5v3.099c0 .929-.13 1.854-.385 2.748L17.5 23.5h-9c-1.5-2-5.417-8.673-5.417-8.673a1.2 1.2 0 0 1 1.76-1.605L7.5 15.5zm6-6v2m-3-3.5v3.5m6-1v2"/>
</svg>
</div>
        <a class="floating-header-share-tw" href="https://twitter.com/share?text=cs231n+-+Lecture+8.+Training+Neural+Networks+II&amp;url=https://12kdh43.github.io/cs231n_lec8"
            onclick="window.open(this.href, 'share-twitter', 'width=550,height=235');return false;">
            <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32"><path d="M30.063 7.313c-.813 1.125-1.75 2.125-2.875 2.938v.75c0 1.563-.188 3.125-.688 4.625a15.088 15.088 0 0 1-2.063 4.438c-.875 1.438-2 2.688-3.25 3.813a15.015 15.015 0 0 1-4.625 2.563c-1.813.688-3.75 1-5.75 1-3.25 0-6.188-.875-8.875-2.625.438.063.875.125 1.375.125 2.688 0 5.063-.875 7.188-2.5-1.25 0-2.375-.375-3.375-1.125s-1.688-1.688-2.063-2.875c.438.063.813.125 1.125.125.5 0 1-.063 1.5-.25-1.313-.25-2.438-.938-3.313-1.938a5.673 5.673 0 0 1-1.313-3.688v-.063c.813.438 1.688.688 2.625.688a5.228 5.228 0 0 1-1.875-2c-.5-.875-.688-1.813-.688-2.75 0-1.063.25-2.063.75-2.938 1.438 1.75 3.188 3.188 5.25 4.25s4.313 1.688 6.688 1.813a5.579 5.579 0 0 1 1.5-5.438c1.125-1.125 2.5-1.688 4.125-1.688s3.063.625 4.188 1.813a11.48 11.48 0 0 0 3.688-1.375c-.438 1.375-1.313 2.438-2.563 3.188 1.125-.125 2.188-.438 3.313-.875z"/></svg>

        </a>
        <a class="floating-header-share-fb" href="https://www.facebook.com/sharer/sharer.php?u=https://12kdh43.github.io/cs231n_lec8"
            onclick="window.open(this.href, 'share-facebook','width=580,height=296');return false;">
            <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32"><path d="M19 6h5V0h-5c-3.86 0-7 3.14-7 7v3H8v6h4v16h6V16h5l1-6h-6V7c0-.542.458-1 1-1z"/></svg>

        </a>
    </div>
    <progress class="progress" value="0">
        <div class="progress-container">
            <span class="progress-bar"></span>
        </div>
    </progress>
</div>


<!-- /post -->

<!-- The #contentFor helper here will send everything inside it up to the matching #block helper found in default.hbs -->


        <!-- Previous/next page links - displayed on every page -->
        

        <!-- The footer at the very bottom of the screen -->
        <footer class="site-footer outer">
            <div class="site-footer-content inner">
                <section class="copyright"><a href="/">Darron's Devlog</a> &copy; 2022</section>
                <!-- 
				<section class="poweredby">Proudly published with <a href="https://jekyllrb.com/">Jekyll</a> &
                    <a href="https://pages.github.com/" target="_blank" rel="noopener">GitHub Pages</a> using
                    <a href="https://github.com/jekyllt/jasper2" target="_blank" rel="noopener">Jasper2</a></section>
                -->
				<nav class="site-footer-nav">
                    <a href="/">Latest Posts</a>
                    
                    
                </nav>
            </div>
        </footer>

    </div>

    <!-- The big email subscribe modal content -->
    
        <div id="subscribe" class="subscribe-overlay">
            <a class="subscribe-overlay-close" href="#" aria-label="Close search"></a>
            <div class="subscribe-overlay-content">
                
                <h1 class="subscribe-overlay-title">Search Darron's Devlog</h1>
                <p class="subscribe-overlay-description">
				</p>
                <span id="searchform" method="post" action="/subscribe/" class="">
    <input class="confirm" type="hidden" name="confirm"  />
    <input class="location" type="hidden" name="location"  />
    <input class="referrer" type="hidden" name="referrer"  />

    <div class="form-group">
        <input class="subscribe-email" onkeyup="myFunc()" 
               id="searchtext" type="text" name="searchtext"  
               placeholder="Search..." />
    </div>
    <script type="text/javascript">
        /**
         * Keyup handler for the search box: when Enter is released, navigate
         * to the search page with the URL-encoded query.
         *
         * @param {KeyboardEvent} [evt] The keyup event. Optional so the inline
         *     `onkeyup="myFunc()"` attribute keeps working; when omitted the
         *     legacy `window.event` is used.
         */
        function myFunc(evt) {
            var keyEvent = evt || window.event;
            if (!keyEvent) {
                return;
            }
            // Prefer the standard `key` property; `keyCode` is deprecated and
            // kept only as a fallback for older browsers.
            if (keyEvent.key === 'Enter' || keyEvent.keyCode === 13) {
                var url = encodeURIComponent($("#searchtext").val());
                location.href = "/search.html?query=" + url;
            }
        }
    </script>
</span>
            </div>
        </div>
    

    <!-- highlight.js -->
    <script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.10.0/components/prism-abap.min.js"></script>
    <script>
      // Run highlight.js over every <pre><code> block once the DOM is parsed.
      // Uses plain DOM APIs because this script executes before the jQuery
      // <script> tag further down the page, so `$` may not exist yet here.
      // NOTE(review): the script tag just above loads a Prism component, not
      // highlight.js, so `hljs` is likely undefined on this page — the guard
      // below turns that into a no-op instead of a ReferenceError. Confirm
      // which highlighter is intended.
      (function () {
        function highlightAll() {
          if (typeof hljs === 'undefined') {
            return;
          }
          var blocks = document.querySelectorAll('pre code');
          for (var i = 0; i < blocks.length; i++) {
            hljs.highlightBlock(blocks[i]);
          }
        }

        if (document.readyState === 'loading') {
          document.addEventListener('DOMContentLoaded', highlightAll);
        } else {
          highlightAll();
        }
      })();
    </script>

    <!-- jQuery + Fitvids, which makes all video embeds responsive -->
    <script
        src="https://code.jquery.com/jquery-3.2.1.min.js"
        integrity="sha256-hwg4gsxgFZhOsEEamdOYGBf13FyQuiTwlAQgxVSNgt4="
        crossorigin="anonymous">
    </script>
    <script type="text/javascript" src="/assets/js/jquery.fitvids.js"></script>
    <script type="text/javascript" src="https://demo.ghost.io/assets/js/jquery.fitvids.js?v=724281a32e"></script>


    <!-- Paginator increased to "infinite" in _config.yml -->
    <!-- if paginator.posts  -->
    <!-- <script>
        var maxPages = parseInt('');
    </script>
    <script src="/assets/js/infinitescroll.js"></script> -->
    <!-- /endif -->

    
    <!-- Add Google Analytics  -->
    <!-- Google Analytics Tracking code -->
 <script>
  // Google Analytics loader. Defines window.ga (if not already defined) as a
  // stub that queues each call's arguments in ga.q, records the current time
  // in ga.l, then creates an async <script> pointing at analytics.js and
  // inserts it before the first <script> element in the document.
  // The script URL is protocol-relative, so it follows the page's scheme.
  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

  // NOTE(review): the tracker is created with an empty property ID ('') —
  // presumably the analytics ID is unset in the site configuration; confirm.
  ga('create', '', 'auto');
  // Queue a pageview hit for the current page.
  ga('send', 'pageview');

 </script>

	
    <!-- The #block helper will pull in data from the #contentFor other template files. In this case, there's some JavaScript which we only want to use in post.hbs, but it needs to be included down here, after jQuery has already loaded. -->
    
        <script>

// NOTE: Scroll performance is poor in Safari
// - this appears to be due to the events firing much more slowly in Safari.
//   Dropping the scroll event and using only a raf loop results in smoother
//   scrolling but continuous processing even when not scrolling
$(document).ready(function () {
    // Let FitVids make embedded videos in the post body scale with the layout.
    $(".post-full-content").fitVids();

    var readingProgress = document.querySelector('progress');
    var floatingHeader = document.querySelector('.floating-header');
    var postTitle = document.querySelector('.post-full-title');

    // Latest known scroll/viewport metrics; refreshed by the event listeners
    // below and consumed by render() at most once per animation frame.
    var scrollTop = window.scrollY;
    var viewportHeight = window.innerHeight;
    var pageHeight = $(document).height();
    var framePending = false;

    // Queue a single render() for the next animation frame; requests made
    // while one is already pending are coalesced into it.
    function scheduleRender() {
        if (framePending) {
            return;
        }
        framePending = true;
        requestAnimationFrame(render);
    }

    // Sync the floating header and the reading-progress bar with the cached
    // scroll position and page dimensions.
    function render() {
        var titleTop = postTitle.getBoundingClientRect().top + window.scrollY;
        var revealOffset = postTitle.offsetHeight + 35;
        var scrollableDistance = pageHeight - viewportHeight;

        // The floating header is shown once the page has scrolled past the
        // post title (its document offset plus its height plus 35).
        var pastTitle = scrollTop >= titleTop + revealOffset;
        floatingHeader.classList[pastTitle ? 'add' : 'remove']('floating-active');

        readingProgress.setAttribute('max', scrollableDistance);
        readingProgress.setAttribute('value', scrollTop);

        framePending = false;
    }

    window.addEventListener('scroll', function () {
        scrollTop = window.scrollY;
        scheduleRender();
    }, {passive: true});

    window.addEventListener('resize', function () {
        viewportHeight = window.innerHeight;
        pageHeight = $(document).height();
        scheduleRender();
    }, false);

    // Paint the initial state without waiting for the first scroll/resize.
    render();
});
</script>

    
    <!-- Ghost outputs important scripts and data with this tag - it should always be the very last thing before the closing body tag -->
    <!-- ghost_foot -->

</body>
</html>