Tiramisu-Compiler
diff --git a/‎benchmarks/CMakeLists.txt
Lines changed: 2 additions & 0 deletions b/‎benchmarks/CMakeLists.txt
Lines changed: 2 additions & 0 deletions
diff --git a/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/dense/clean.sh
Lines changed: 1 addition & 6 deletions b/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/dense/clean.sh
Lines changed: 1 addition & 6 deletions
diff --git a/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/README.md
Lines changed: 9 additions & 1 deletion b/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/README.md
Lines changed: 9 additions & 1 deletion
diff --git a/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/clean.sh
Lines changed: 1 addition & 1 deletion b/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/clean.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/compile_and_run_mkl_sparse.sh
Lines changed: 13 additions & 0 deletions b/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/compile_and_run_mkl_sparse.sh
Lines changed: 13 additions & 0 deletions
diff --git a/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/configure.h
Lines changed: 6 additions & 0 deletions b/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/configure.h
Lines changed: 6 additions & 0 deletions
diff --git a/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/conv_relu_maxpool_generator_mkl.c
Lines changed: 4 additions & 4 deletions b/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/conv_relu_maxpool_generator_mkl.c
Lines changed: 4 additions & 4 deletions
diff --git a/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/conv_relu_maxpool_generator_mkl_sparse.cpp
Lines changed: 250 additions & 0 deletions b/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/conv_relu_maxpool_generator_mkl_sparse.cpp
Lines changed: 250 additions & 0 deletions
diff --git a/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/conv_relu_maxpool_generator_mkldnn.cpp
Lines changed: 3 additions & 3 deletions b/‎benchmarks/DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse/conv_relu_maxpool_generator_mkldnn.cpp
Lines changed: 3 additions & 3 deletions
@@ -21,11 +21,13 @@ add_subdirectory(DNN/blocks/LSTM/cpu)
 add_subdirectory(DNN/blocks/LSTM/cpu_lib)
 add_subdirectory(DNN/blocks/LSTM/cpu_lib_sparse)
 add_subdirectory(DNN/blocks/vggBlock/cpu/dense)
+add_subdirectory(DNN/blocks/vggBlock/cpu/sparse)
 add_subdirectory(DNN/blocks/fusedresNet/cpu/dense)
 add_subdirectory(DNN/blocks/fusedresNet_inference/cpu/sparse)
 add_subdirectory(DNN/blocks/fusedresNet_inference/cpu/dense)
 add_subdirectory(DNN/blocks/DenseNetBlock/cpu/dense)
 add_subdirectory(DNN/blocks/Conv-ReLU-MaxPool/cpu/dense)
 add_subdirectory(DNN/blocks/Conv-ReLU-MaxPool/cpu/sparse)
 add_subdirectory(DNN/blocks/Resize-Conv-ReLU-MaxPool/cpu/dense)
+add_subdirectory(DNN/blocks/Resize-Conv-ReLU-MaxPool/cpu/sparse)
 add_subdirectory(DNN/blocks/Conv-Relu-FC-Softmax/cpu/dense)
@@ -1,7 +1,2 @@
-<<<<<<< HEAD
-rm -rf conv_relu_maxpool_generator_tiramisu conv_relu_maxpool_tiramisu.o conv_relu_maxpool_tiramisu.o.h wrapper_nn_block_conv_relu_maxpool conv_relu_maxpool_mkl conv_relu_maxpool_mkldnn mkl_result.txt tiramisu_result.txt tf_model.pb
+rm -rf conv_relu_maxpool_generator_tiramisu conv_relu_maxpool_tiramisu.o conv_relu_maxpool_tiramisu.o.h wrapper_nn_block_conv_relu_maxpool conv_relu_maxpool_mkl conv_relu_maxpool_mkldnn mkl_result.txt tiramisu_result.txt tf_model.pb tvm_autotuning.log
 rm -rf .pkl_memoize_py3 param_tuning.h
-=======
-rm conv_relu_maxpool_generator_tiramisu conv_relu_maxpool_tiramisu.o conv_relu_maxpool_tiramisu.o.h wrapper_nn_block_conv_relu_maxpool conv_relu_maxpool_mkl conv_relu_maxpool_mkldnn mkl_result.txt tiramisu_result.txt tf_model.pb tvm_autotuning.log
-rm -rf .pkl_memoize_py3
->>>>>>> upstream/master
@@ -18,6 +18,9 @@ The files in this folder are organized as follows:
     Intel MKL
         spconv_relu_maxpool_generator_mkl.c: code that calls Intel MKL's dense conv-relu-maxpool.
 
+    Intel MKL Sparse
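+        conv_relu_maxpool_generator_mkl_sparse.cpp: code that lowers the convolution with im2col, multiplies the sparse (CSR) weights by the lowered input using Intel MKL Sparse BLAS, then applies ReLU and MaxPool with Intel MKL-DNN.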
+
 To run this benchmark:
 
     At the directory build/benchmarks/DNN/blocks/Conv-ReLU-Maxpool/cpu/sparse execute
@@ -36,6 +39,11 @@ To run this benchmark:
     then
         ./spconv_relu_maxpool_wrapper
 
+    To compare the result of tiramisu with MKL Sparse execute :
+        ./compile_and_run_mkl_sparse.sh
+    then
+        ./spconv_relu_maxpool_wrapper
+
     execution results could be found in the text files :
-        mkl_result.txt (same for Intel MKL and Intel MKL-DNN)
+        mkl_result.txt (same for Intel MKL, Intel MKL-DNN and Intel MKL Sparse)
         tiramisu_result.txt
@@ -1 +1 @@
-rm -rf generated_spconv_relu_maxpool.o.h generated_spconv_relu_maxpool.o mkl_result.txt tiramisu_result.txt spconv_relu_maxpool_wrapper spconv_relu_maxpool_generator conv_relu_maxpool_mkl conv_relu_maxpool_mkldnn
+rm -rf generated_spconv_relu_maxpool.o.h generated_spconv_relu_maxpool.o mkl_result.txt tiramisu_result.txt spconv_relu_maxpool_wrapper spconv_relu_maxpool_generator conv_relu_maxpool_mkl conv_relu_maxpool_mkldnn conv_relu_maxpool_mkl_sparse
@@ -0,0 +1,13 @@
+#set -x
+
+source ../../../../../configure_paths.sh
+MKLDNNROOT=/usr/local/
+
+export INCLUDES="-I${MKL_PREFIX}/include/ -I${MKLDNNROOT}/include"
+export LIBRARIES="${MKL_FLAGS} -lisl -lz -lpthread -ldl "
+export LIBRARIES_DIR="-L${MKL_PREFIX}/lib/${MKL_LIB_PATH_SUFFIX} -L${MKLDNNROOT}/lib"
+
+source ${MKL_PREFIX}/bin/mklvars.sh ${MKL_LIB_PATH_SUFFIX}
+
+g++ -O3 -DMKL_ILP64 -m64 ${INCLUDES} conv_relu_maxpool_generator_mkl_sparse.cpp -o conv_relu_maxpool_mkl_sparse ${LIBRARIES_DIR} -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -fopenmp -lm -ldl  -lmkldnn
+./conv_relu_maxpool_mkl_sparse
@@ -37,6 +37,12 @@
 #define Y_BL 2
 #define Y_NB_BL (N/Y_BL)
 
+// Parameters for MKL Sparse's IM2COL: the N x N image is processed in
+// H_BL x W_BL blocks.
+// The *_NB_BL expansions are parenthesized (like Y_NB_BL above) so that they
+// evaluate as a single value inside larger expressions such as `a / H_NB_BL`.
+#define H_BL 32 // Must be a divisor of N
+#define H_NB_BL (N/H_BL)
+#define W_BL 32 // Must be a divisor of N
+#define W_NB_BL (N/W_BL)
+
 // Number of features in the input
 #define FIn 3
 // Number of features in the output
 
@@ -40,7 +40,7 @@ static dnnError_t init_conversion(dnnPrimitive_t *cv, float **ptr_out,
 }
 
 // Original version by: Kyle Spafford Adapted for COO Format
-int initRandomSparseMatrix(float matrix[FOut][FIn][K][K], float density, const int KK, const int fin_size, const int fout_size)
+int initRandomSparseMatrix(float matrix[FOut][FIn][K][K], float density, const int KK, const int fin_size, const int fout_size, int seed)
 {
   const int n = KK * KK * fin_size * fout_size * density; // number of non zero elements
   int nnzAssigned = 0;
@@ -50,10 +50,10 @@ int initRandomSparseMatrix(float matrix[FOut][FIn][K][K], float density, const i
   int total_num_entries = KK * KK * fin_size * fout_size;
   double prob = (double)n / ((double) total_num_entries);
 
-  // Randomly decide whether entry i,j gets a value, but ensure n values
+  // Randomly decide whether an entry gets a value, but ensure n values
   // are assigned
   int fillRemaining = 0;
-  srand(1);
+  srand(seed);
   for (int fout = 0; fout < fout_size; fout++)
   {
     for (int fin = 0; fin < fin_size; fin++)
@@ -113,7 +113,7 @@ int main()
     size_t maxpool_kernel_size[] = {2, 2};
     size_t maxpool_strides[] = {2, 2};
     int maxpool_offset[] = {0, 0};
-    initRandomSparseMatrix(conv_filter_param, WEIGHTS_DENSITY, K, FIn, FOut);
+    initRandomSparseMatrix(conv_filter_param, WEIGHTS_DENSITY, K, FIn, FOut, 1);
 
     srand(3);
     // Allocate buffers
 
@@ -0,0 +1,250 @@
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <omp.h>
+#include "mkl.h"
+
+#include "mkldnn.hpp"
+#include "im2col.hpp"
+#include "mkl_spblas.h"
+
+// Original version by: Kyle Spafford. Adapted for CSR format.
+//
+// Fills a randomly generated sparse filter directly in CSR form.
+// Each output feature is one CSR row; the column index of a weight is
+// fin * KK * KK + ky * KK + kx.
+//
+//   filter_values : receives the n non-zero values
+//   filter_idx    : receives the column index of each non-zero value
+//   filter_finptr : receives fout_size + 1 row offsets (row r spans
+//                   [filter_finptr[r], filter_finptr[r + 1]))
+//   n             : exact number of non-zero values to generate
+//   KK            : kernel width/height
+//   fin_size      : number of input features
+//   fout_size     : number of output features
+//   seed          : seed passed to srand()
+//
+// Terminates the process with a diagnostic if exactly n values could not be
+// assigned.
+void initRandomWeights(float* filter_values, MKL_INT* filter_idx, MKL_INT* filter_finptr, const int n, const int KK, const int fin_size, const int fout_size, const int seed)
+{
+    const int row_size = KK * KK * fin_size;
+    const int total_num_entries = row_size * fout_size;
+
+    // Probability that a non-zero is assigned to a given spot in the matrix
+    const double prob = (double)n / ((double) total_num_entries);
+
+    // Seed random number generator
+    srand(seed);
+
+    int nnzAssigned = 0;
+    // Once set, every remaining entry is filled so that n values are reached.
+    bool fillRemaining = false;
+
+    for (int fout = 0; fout < fout_size; fout++)
+    {
+        filter_finptr[fout] = static_cast<MKL_INT>(nnzAssigned);
+        for (int fin = 0; fin < fin_size; fin++)
+        {
+            for (int ky = 0; ky < KK; ky++)
+            {
+                for (int kx = 0; kx < KK; kx++)
+                {
+                    const int column = fin * KK * KK + ky * KK + kx;
+                    const int numEntriesLeft = total_num_entries - (fout * row_size + column);
+                    const int needToAssign = n - nnzAssigned;
+                    if (numEntriesLeft <= needToAssign)
+                        fillRemaining = true;
+
+                    // The random draw must stay on the left of `||` and behind
+                    // `nnzAssigned < n`: this keeps the sequence of rand()
+                    // calls, and therefore the generated weights, unchanged.
+                    if ((nnzAssigned < n && ((double) rand() / (RAND_MAX + 1.0)) <= prob) || fillRemaining)
+                    {
+                        filter_idx[nnzAssigned] = static_cast<MKL_INT>(column);
+                        filter_values[nnzAssigned] = ((float)(rand()%256 - 128)) / 127.f;
+                        nnzAssigned++;
+                    }
+                }
+            }
+        }
+    }
+    filter_finptr[fout_size] = static_cast<MKL_INT>(nnzAssigned);
+
+    if (nnzAssigned != n)
+    {
+        // An exit status of 500 does not fit in the 8 bits reported to the
+        // parent process; report the problem and use a portable status.
+        fprintf(stderr, "initRandomWeights: assigned %d non-zero values, expected %d\n", nnzAssigned, n);
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Derives the number of non-zero weights from `density`, fills the CSR arrays
+// (values, column indices, row offsets) through initRandomWeights and returns
+// that number.
+int generateCSRWeights(float *filter_values, float density, MKL_INT *filter_idx, MKL_INT* filter_finptr, int KK, int fin_size, int fout_size, int seed) {
+    // The product is evaluated in float and truncated toward zero.
+    const int nonzero_count = static_cast<int>(KK * KK * fin_size * fout_size * density);
+    initRandomWeights(filter_values, filter_idx, filter_finptr, nonzero_count, KK, fin_size, fout_size, seed);
+    return nonzero_count;
+}
+
+using namespace mkldnn;
+
+// Benchmark driver for the sparse Conv-ReLU-MaxPool block:
+//   1. generates random CSR weights, a random input and a random bias,
+//   2. for each test run and each batch element, lowers the input with
+//      im2col_cpu, multiplies the CSR weights by the lowered input with
+//      mkl_sparse_s_mm and adds the bias,
+//   3. applies ReLU (in place) and 2x2 max pooling through MKL-DNN,
+//   4. prints the median run time and, if WRITE_RESULT_TO_FILE is set,
+//      writes the pooled output to mkl_result.txt.
+int main()
+{
+  std::vector<double> duration_vector;
+
+  engine cpu_engine(engine::kind::cpu, 0);
+  stream cpu_stream(cpu_engine);
+
+  std::vector<primitive> net;
+  std::vector<std::unordered_map<int, memory>> net_args;
+
+  memory::dims pool_strides = {2, 2};
+  memory::dims pool_kernel = {2, 2};
+  memory::dims pool_padding = {0, 0};
+
+  // Size the CSR arrays with the same float expression generateCSRWeights
+  // evaluates, so the buffers always hold exactly what the generator writes.
+  // Heap storage is used instead of variable-length stack arrays.
+  const float density = WEIGHTS_DENSITY;
+  const int FNNZ = K * K * FIn * FOut * density;
+  std::vector<float> filter_values(FNNZ);
+  std::vector<MKL_INT> filter_idx(FNNZ);
+  MKL_INT filter_finptr[FOut+1];
+  // Generate sparse weights matrix
+  generateCSRWeights(filter_values.data(), density, filter_idx.data(), filter_finptr, K, FIn, FOut, 1);
+
+  // Descriptor of main sparse matrix properties
+  struct matrix_descr descrFilter;
+  // Structure with sparse matrix stored in CSR format
+  sparse_matrix_t       csrFilter;
+  float alpha = 1.0, beta = 0.0;
+
+  // Create handle with matrix stored in CSR format
+  mkl_sparse_s_create_csr (&csrFilter, SPARSE_INDEX_BASE_ZERO,
+                                  FOut,  // number of rows
+                                  FIn*K*K,  // number of cols
+                                  filter_finptr,
+                                  filter_finptr+1,
+                                  filter_idx.data(),
+                                  filter_values.data());
+
+  // Analyze sparse matrix; choose proper kernels and workload balancing strategy
+  mkl_sparse_optimize(csrFilter);
+
+  // Create matrix descriptor
+  descrFilter.type = SPARSE_MATRIX_TYPE_GENERAL;
+
+  // Allocate buffers
+  float* input_buf = (float*)malloc(sizeof(float) * FIn * (N + 2) * (N + 2) * BATCH_SIZE);
+  float* conv_bias_buf = (float*)malloc(sizeof(float) * FOut);
+  float* result_buf = (float*)malloc(sizeof(float) * FIn * (N) * (N) * K * K * BATCH_SIZE);
+  float* conv_output_buf = (float*)malloc(sizeof(float) * FOut * (N) * (N) * BATCH_SIZE);
+
+  if (input_buf == NULL || conv_bias_buf == NULL || result_buf == NULL || conv_output_buf == NULL) {
+    fprintf(stderr, "Error allocating benchmark buffers.\n");
+    mkl_sparse_destroy(csrFilter);
+    free(input_buf);
+    free(conv_bias_buf);
+    free(result_buf);
+    free(conv_output_buf);
+    return 1;
+  }
+
+  srand(3);
+  for(int b = 0; b < BATCH_SIZE; ++b)
+    for (int fin = 0; fin < FIn; ++fin)
+      for (int y = 0; y < N + 2; ++y)
+        for (int x = 0; x < N + 2; ++x)
+          input_buf[x + y*(N+2) + fin*(N+2)*(N+2) + b*(N+2)*(N+2)*FIn] = ((float)(rand() % 256 - 128)) / 127.f;
+
+  for (int i = 0; i < FOut; i++)
+      conv_bias_buf[i] = ((float)(rand()%256 - 128)) / 127.f;
+
+  printf("Buffers Initialized\n");
+
+  auto conv_output_md = memory::desc(
+    {BATCH_SIZE, FOut, N, N},
+    memory::data_type::f32,
+    memory::format_tag::nchw
+
+  );
+  auto conv_output_mem = memory(conv_output_md, cpu_engine, conv_output_buf);
+
+  // ReLU reads and writes the convolution output buffer (in place)
+  auto relu_desc = eltwise_forward::desc(prop_kind::forward_inference,
+            algorithm::eltwise_relu, conv_output_md,
+            0);
+  auto relu_pd = eltwise_forward::primitive_desc(relu_desc, cpu_engine);
+  net.push_back(eltwise_forward(relu_pd));
+  net_args.push_back({
+    {MKLDNN_ARG_SRC, conv_output_mem},
+    {MKLDNN_ARG_DST, conv_output_mem}
+  });
+
+  auto pool_output_md = memory::desc(
+    {BATCH_SIZE, FOut, N/2, N/2},
+    memory::data_type::f32,
+    memory::format_tag::any
+  );
+
+  auto pool_d = pooling_forward::desc(
+    prop_kind::forward_inference,
+    algorithm::pooling_max,
+    conv_output_md,
+    pool_output_md,
+    pool_strides,
+    pool_kernel,
+    pool_padding,
+    pool_padding
+  );
+
+  auto pool_pd = pooling_forward::primitive_desc(
+    pool_d,
+    cpu_engine
+  );
+
+  auto pool_dst_mem = memory(pool_pd.dst_desc(), cpu_engine);
+
+  net.push_back(pooling_forward(pool_pd));
+  net_args.push_back({
+    {MKLDNN_ARG_SRC, conv_output_mem},
+    {MKLDNN_ARG_DST, pool_dst_mem}
+  });
+
+  omp_set_num_threads(4);
+  for (int i = 0; i < NB_TESTS; ++i) {
+    double start = rtclock();
+    for(int batch = 0; batch<BATCH_SIZE; batch++){
+      im2col_cpu(&input_buf[batch*(FIn*(N+2)*(N+2))], FIn,
+        N+2, N+2, K, K,
+        1, 1,
+        &result_buf[batch*(FIn*N*N*K*K)]
+      );
+      // Filter weights are (FOut) * (FIn * K * K)
+      // Lowered Input is   (FIn * K * K) * (N * N)
+      // The result of the mult is : (FOut) * (N * N)
+      // Calculates C = alpha*A*B + beta*C; beta is 0, so C is overwritten
+      // on every run instead of accumulating the previous result.
+      mkl_sparse_s_mm(SPARSE_OPERATION_NON_TRANSPOSE,
+                      alpha,
+                      csrFilter,
+                      descrFilter,
+                      SPARSE_LAYOUT_ROW_MAJOR,
+                      &result_buf[batch*(FIn*N*N*K*K)],
+                      N*N,
+                      N*N,
+                      beta,
+                      &conv_output_buf[batch*(FOut*N*N)],
+                      N*N
+      );
+      // Add the per-output-feature bias
+      #pragma omp parallel for
+      for(int fout = 0; fout<FOut; fout++){
+        for(int y=0; y<N; y++)
+          for(int x=0; x<N; x++)
+            conv_output_buf[batch*(FOut*N*N) + fout*N*N + y*N + x] += conv_bias_buf[fout];
+      }
+    }
+    // Execute relu/maxpool
+    for (size_t j = 0; j < net.size(); ++j)
+      net[j].execute(cpu_stream, net_args[j]);
+    cpu_stream.wait();
+
+    double end = rtclock();
+    duration_vector.push_back((end - start) * 1000);
+  }
+
+  std::cout << "\t\tSparse Lowered Convolution time : "
+  << median(duration_vector) << " ms" << std::endl;
+
+  auto output_usr_md = memory::desc(
+    {BATCH_SIZE, FOut, N/2, N/2},
+    memory::data_type::f32,
+    memory::format_tag::nchw
+  );
+
+  // Convert the pooling output to plain nchw before reading it
+  auto output_mem = memory(output_usr_md, cpu_engine);
+  reorder(pool_dst_mem, output_mem)
+    .execute(cpu_stream, pool_dst_mem, output_mem);
+  // Make sure the reorder has completed before the data is read
+  cpu_stream.wait();
+
+  if (WRITE_RESULT_TO_FILE){
+    float* output_buf = (float*)output_mem.get_data_handle();
+    // Write results to file
+    FILE* f = fopen("mkl_result.txt", "w");
+    if (f == NULL) {
+      // No early return here: the cleanup below must still run.
+      printf("Error creating mkl_result.txt.\n");
+    } else {
+      for(int b=0; b<BATCH_SIZE; b++)
+        for(int fout=0; fout<FOut; fout++)
+          for(int y=0; y<N/2; y++)
+            for(int x=0; x<N/2; x++)
+              fprintf(f, "%.17g\n", output_buf[x + y*N/2 + fout*N/2*N/2 + b*N/2*N/2*FOut]);
+
+      fclose(f);
+    }
+  }
+  mkl_sparse_destroy(csrFilter);
+  free(input_buf);
+  free(conv_bias_buf);
+  free(result_buf);
+  free(conv_output_buf);
+  return 0;
+}
@@ -7,7 +7,7 @@ using namespace mkldnn;
 using namespace std;
 
 // Original version by: Kyle Spafford Adapted for COO Format
-int initRandomSparseMatrix(float* matrix, float density, const int KK, const int fin_size, const int fout_size)
+int initRandomSparseMatrix(float* matrix, float density, const int KK, const int fin_size, const int fout_size, int seed)
 {
   const int n = KK * KK * fin_size * fout_size * density; // number of non zero elements
   int nnzAssigned = 0;
@@ -20,7 +20,7 @@ int initRandomSparseMatrix(float* matrix, float density, const int KK, const int
   // Randomly decide whether entry i,j gets a value, but ensure n values
   // are assigned
   int fillRemaining = 0;
-  srand(1);
+  srand(seed);
   for (int fout = 0; fout < fout_size; fout++)
   {
     for (int fin = 0; fin < fin_size; fin++)
@@ -77,7 +77,7 @@ void conv_relu_maxpool_block()
   std::vector<float> conv_bias_buf(FOut);
   std::vector<float> conv_weights_buf(FOut * FIn * K * K);
 
-  initRandomSparseMatrix(conv_weights_buf.data(), WEIGHTS_DENSITY, K, FIn, FOut);
+  initRandomSparseMatrix(conv_weights_buf.data(), WEIGHTS_DENSITY, K, FIn, FOut, 1);
 
   srand(3);
   for (int i = 0; i < BATCH_SIZE*FIn*(N + 2)*(N + 2); i++)
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-rm -rf generated_spconv_relu_maxpool.o.h generated_spconv_relu_maxpool.o mkl_result.txt tiramisu_result.txt spconv_relu_maxpool_wrapper spconv_relu_maxpool_generator conv_relu_maxpool_mkl conv_relu_maxpool_mkldnn`
	`1`	`+rm -rf generated_spconv_relu_maxpool.o.h generated_spconv_relu_maxpool.o mkl_result.txt tiramisu_result.txt spconv_relu_maxpool_wrapper spconv_relu_maxpool_generator conv_relu_maxpool_mkl conv_relu_maxpool_mkldnn conv_relu_maxpool_mkl_sparse`