From 3d389aa8afbd56c85a4a744572885ddf0b423c2b Mon Sep 17 00:00:00 2001
From: David Nadlinger <code@klickverbot.at>
Date: Mon, 19 Oct 2015 23:38:55 +0200
Subject: [PATCH 1/5] Cleanup: Remove tracking of number of multiplications

The statistic is currently wrong for us anyway, and not very useful for using
the algorithm (as opposed to developing it). If somebody is really interested
in this, they can just supply a custom matrix multiplication operator in Julia.
---
 src/expmv_fun.jl            | 12 ++----------
 src/normAm.jl               |  4 +---
 src/select_taylor_degree.jl | 16 ++++------------
 3 files changed, 7 insertions(+), 25 deletions(-)

diff --git a/src/expmv_fun.jl b/src/expmv_fun.jl
index 80ec01d..030dc91 100644
--- a/src/expmv_fun.jl
+++ b/src/expmv_fun.jl
@@ -4,13 +4,10 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
                # bal = false, 
 
     #EXPMV   Matrix exponential times vector or matrix.
-    #   [F,S,M,MV,MVD] = EXPMV(t,A,B,[],PREC) computes EXPM(t*A)*B without
+    #   [F,S,M] = EXPMV(t,A,B,[],PREC) computes EXPM(t*A)*B without
     #   explicitly forming EXPM(t*A). PREC is the required accuracy, 'double',
     #   'single' or 'half', and defaults to CLASS(A).
     #
-    #   A total of MV products with A or A^* are used, of which MVD are
-    #   for norm estimation.
-    #
     #   The full syntax is
     #
     #     [f,s,m,mv,mvd,unA] = expmv(t,A,b,M,prec,shift,bal,full_term,prnt).
@@ -48,12 +45,9 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
     
     if isempty(M)
         tt = 1
-        (M,mvd,alpha,unA) = select_taylor_degree(t*A,b)
-        mv = mvd
+        (M,alpha,unA) = select_taylor_degree(t*A,b)
     else
         tt = t
-        mv = 0
-        mvd = 0
     end
     
     tol =   
@@ -104,7 +98,6 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
         c1 = norm(b,Inf);
         for k = 1:m
             b = (t/(s*k))*(A*b);
-            mv = mv + 1;
             f =  f + b;
             c2 = norm(b,Inf);
             if !full_term
@@ -130,6 +123,5 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
     #    f = D*f
     #end
     
-    #return (f,s,m,mv,mvd,unA)
     return f
 end
\ No newline at end of file
diff --git a/src/normAm.jl b/src/normAm.jl
index ed2ffbc..897bf40 100644
--- a/src/normAm.jl
+++ b/src/normAm.jl
@@ -56,15 +56,13 @@ function normAm(A,m)
             e = A'*e;
         end
         c = norm(e,Inf);
-        mv = m;
     else
         #(c,_,_,it) = normest1(@afun_power,t);
         #mv = it[2]*t*m;
         c = norm(A^m,1);
-        mv = 6*t*m;
     end
     
-    return (c,mv)
+    return c
     
 end
 
diff --git a/src/select_taylor_degree.jl b/src/select_taylor_degree.jl
index 9748738..9eb72c6 100644
--- a/src/select_taylor_degree.jl
+++ b/src/select_taylor_degree.jl
@@ -1,6 +1,3 @@
-#function  [M,mv,alpha,unA] = ...
-#           select_taylor_degree(A,b,m_max,p_max,prec,shift,bal,force_estm)
-
 function select_taylor_degree(A,
                               b;
                               m_max = 55,
@@ -12,10 +9,9 @@ function select_taylor_degree(A,
 
 
     #SELECT_TAYLOR_DEGREE   Select degree of Taylor approximation.
-    #   [M,MV,alpha,unA] = SELECT_TAYLOR_DEGREE(A,m_max,p_max) forms a matrix M
+    #   [M,alpha,unA] = SELECT_TAYLOR_DEGREE(A,m_max,p_max) forms a matrix M
     #   for use in determining the truncated Taylor series degree in EXPMV
     #   and EXPMV_TSPAN, based on parameters m_max and p_max.
-    #   MV is the number of matrix-vector products with A or A^* computed.
     
     #   Reference: A. H. Al-Mohy and N. J. Higham, Computing the action of
     #   the matrix exponential, with an application to exponential
@@ -48,8 +44,6 @@ function select_taylor_degree(A,
         A = A-mu*speye(n);
     end
     
-    mv = 0;
-    
     if !force_estm
         normA = norm(A,1)
     end
@@ -63,9 +57,8 @@ function select_taylor_degree(A,
         eta = zeros(p_max,1); 
         alpha = zeros(p_max-1,1);
         for p = 1:p_max
-            (c,k) = normAm(A,p+1);
+            c = normAm(A,p+1);
             c = c^(1/(p+1));
-            mv = mv + k;
             eta[p] = c;
         end
         for p = 1:p_max-1
@@ -81,6 +74,5 @@ function select_taylor_degree(A,
         end
     end
     
-    return (M,mv,alpha,unA)
-    
-end
\ No newline at end of file
+    return (M,alpha,unA)
+end

From 73ba460c1e43192c4853a345684378512c0bbd24 Mon Sep 17 00:00:00 2001
From: David Nadlinger <code@klickverbot.at>
Date: Mon, 19 Oct 2015 23:42:22 +0200
Subject: [PATCH 2/5] Cleanup: Remove unused return values of
 select_taylor_degree

---
 src/expmv_fun.jl            | 9 +++------
 src/select_taylor_degree.jl | 4 +---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/expmv_fun.jl b/src/expmv_fun.jl
index 030dc91..07f9c5c 100644
--- a/src/expmv_fun.jl
+++ b/src/expmv_fun.jl
@@ -10,9 +10,7 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
     #
     #   The full syntax is
     #
-    #     [f,s,m,mv,mvd,unA] = expmv(t,A,b,M,prec,shift,bal,full_term,prnt).
-    #
-    #   unA = 1 if the alpha_p were used instead of norm(A).
+    #     f = expmv(t,A,b,M,prec,shift,bal,full_term,prnt).
     #
     #   If repeated invocation of EXPMV is required for several values of t
     #   or B, it is recommended to provide M as an external parameter as
@@ -38,14 +36,13 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
     n = size(A,1)
     
     if shift
-        mu = trace(A)/n 
-        #mu = full(mu) # Much slower without the full!
+        mu = trace(A)/n
         A = A-mu*speye(n)
     end
     
     if isempty(M)
         tt = 1
-        (M,alpha,unA) = select_taylor_degree(t*A,b)
+        M = select_taylor_degree(t * A, b)
     else
         tt = t
     end
diff --git a/src/select_taylor_degree.jl b/src/select_taylor_degree.jl
index 9eb72c6..a609c8d 100644
--- a/src/select_taylor_degree.jl
+++ b/src/select_taylor_degree.jl
@@ -49,11 +49,9 @@ function select_taylor_degree(A,
     end
     
     if !force_estm && normA <= 4*theta[m_max]*p_max*(p_max + 3)/(m_max*size(b,2));
-        unA = 1;
         c = normA;
         alpha = c*ones(p_max-1,1);
     else
-        unA = 0;
         eta = zeros(p_max,1); 
         alpha = zeros(p_max-1,1);
         for p = 1:p_max
@@ -74,5 +72,5 @@ function select_taylor_degree(A,
         end
     end
     
-    return (M,alpha,unA)
+    return M
 end

From f3cf785508f13d64260474cd4af86bb476f3e85a Mon Sep 17 00:00:00 2001
From: David Nadlinger <code@klickverbot.at>
Date: Mon, 19 Oct 2015 23:45:33 +0200
Subject: [PATCH 3/5] Make it clearer that select_taylor_degree only needs the
 width of b, not the values

---
 src/expmv_fun.jl            | 4 ++--
 src/select_taylor_degree.jl | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/expmv_fun.jl b/src/expmv_fun.jl
index 07f9c5c..0c971eb 100644
--- a/src/expmv_fun.jl
+++ b/src/expmv_fun.jl
@@ -14,7 +14,7 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
     #
     #   If repeated invocation of EXPMV is required for several values of t
     #   or B, it is recommended to provide M as an external parameter as
-    #   M = SELECT_TAYLOR_DEGREE(A,m_max,p_max,prec,shift,bal,true).
+    #   M = SELECT_TAYLOR_DEGREE(A,b_columns,m_max,p_max,prec,shift,bal,true).
     #   This also allows choosing different m_max and p_max.
     
     #   Reference: A. H. Al-Mohy and N. J. Higham, Computing the action of
@@ -42,7 +42,7 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
     
     if isempty(M)
         tt = 1
-        M = select_taylor_degree(t * A, b)
+        M = select_taylor_degree(t * A, size(b, 2))
     else
         tt = t
     end
diff --git a/src/select_taylor_degree.jl b/src/select_taylor_degree.jl
index a609c8d..4378d83 100644
--- a/src/select_taylor_degree.jl
+++ b/src/select_taylor_degree.jl
@@ -1,5 +1,5 @@
 function select_taylor_degree(A,
-                              b;
+                              b_columns;
                               m_max = 55,
                               p_max = 8,
                               prec = "double",
@@ -48,7 +48,7 @@ function select_taylor_degree(A,
         normA = norm(A,1)
     end
     
-    if !force_estm && normA <= 4*theta[m_max]*p_max*(p_max + 3)/(m_max*size(b,2));
+    if !force_estm && normA <= 4*theta[m_max]*p_max*(p_max + 3)/(m_max*b_columns);
         c = normA;
         alpha = c*ones(p_max-1,1);
     else

From c2f33b4dc3c19bf5b9a033f903f0002b09403080 Mon Sep 17 00:00:00 2001
From: David Nadlinger <code@klickverbot.at>
Date: Tue, 20 Oct 2015 00:17:58 +0200
Subject: [PATCH 4/5] Fix most obvious type instabilities

This does not necessarily have a big impact on performance, but
helps when looking at the generated code (or code_warntype, for
that matter).
---
 src/expmv_fun.jl | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/expmv_fun.jl b/src/expmv_fun.jl
index 0c971eb..43bb1d1 100644
--- a/src/expmv_fun.jl
+++ b/src/expmv_fun.jl
@@ -1,6 +1,6 @@
 export expmv
 
-function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = false, prnt = false)
+function expmv(t, A, b; M::Array{Float64, 2} = Array(Float64, 0, 0), prec = "double", shift = false, full_term = false, prnt = false)
                # bal = false, 
 
     #EXPMV   Matrix exponential times vector or matrix.
@@ -41,22 +41,22 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
     end
     
     if isempty(M)
-        tt = 1
+        tt = 1.0
         M = select_taylor_degree(t * A, size(b, 2))
     else
         tt = t
     end
     
-    tol =   
-      if prec == "double"
-          2.0^(-53)
+    tol =
+      if prec == "half"
+          2.0^(-10)
       elseif prec == "single"
           2.0^(-24)
-      elseif prec == "half"   
-          2.0^(-10)
+      else
+          2.0^(-53)
       end
     
-    s = 1;
+    s = 1.0;
     
     if t == 0
         m = 0;
@@ -74,15 +74,13 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
             cost,m = findmin(C);  # when C is one column. Happens if p_max = 2.
         end
         if cost == Inf
-            cost = 0 
+            cost = 0.0
         end
-        s = max(cost/m,1);
+        s = max(cost/m,1.0);
     end
     
-    eta = 1;
-    
     if shift 
-        eta = exp(t*mu/s)
+        eta = exp(t * mu / s)
     end
     
     f = b;
@@ -108,7 +106,9 @@ function expmv(t, A, b; M = [], prec = "double", shift = false, full_term = fals
             end
             
         end
-        f = eta*f
+        if shift
+            f = eta*f
+        end
         b = f
     end
     

From 3eced9f9db3bb8b133bf5adc1e5062c73ceec0ba Mon Sep 17 00:00:00 2001
From: David Nadlinger <code@klickverbot.at>
Date: Tue, 20 Oct 2015 23:22:23 +0200
Subject: [PATCH 5/5] Use in-place operations in inner loop

---
 src/expmv_fun.jl | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/src/expmv_fun.jl b/src/expmv_fun.jl
index 43bb1d1..e2d1839 100644
--- a/src/expmv_fun.jl
+++ b/src/expmv_fun.jl
@@ -82,34 +82,37 @@ function expmv(t, A, b; M::Array{Float64, 2} = Array(Float64, 0, 0), prec = "dou
     if shift 
         eta = exp(t * mu / s)
     end
-    
-    f = b;
-    
-    # if prnt 
-    #     fprintf("m = %2.0f, s = %g, m_actual = ", m, s) 
-    # end
-    
-    for i = 1:s
-        c1 = norm(b,Inf);
+
+    f = copy(b)
+    b1 = copy(b)
+    b2 = similar(b)
+    @inbounds for i = 1:s
+        c1 = norm(b1, Inf)
         for k = 1:m
-            b = (t/(s*k))*(A*b);
-            f =  f + b;
-            c2 = norm(b,Inf);
+            A_mul_B!(b2, A, b1)
+            @simd for l in 1:n
+                b2[l] *= (t / (s * k))
+            end
+            (b1, b2) = (b2, b1)
+
+            @simd for l in 1:n
+                f[l] = f[l] + b1[l]
+            end
+            c2 = norm(b1, Inf)
             if !full_term
-                if c1 + c2 <= tol*norm(f,Inf)
+                if c1 + c2 <= tol * norm(f, Inf)
                     # if prnt
                     #     fprintf(" %2.0f, ", k)
                     # end
                     break
                 end
-                c1 = c2;
+                c1 = c2
             end
-            
         end
         if shift
-            f = eta*f
+            scale!(f, eta)
         end
-        b = f
+        copy!(b1, f)
     end
     
     # if prnt