diff --git a/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb b/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb
new file mode 100644
index 0000000..5355bef
--- /dev/null
+++ b/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb
@@ -0,0 +1,1557 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 0. Load Python libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import glob\n",
+ "import json\n",
+ "import nmrglue as ng\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import pickle as pickle\n",
+ "import pandas as pd\n",
+ "import random\n",
+ "import math\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from collections import Counter, defaultdict, OrderedDict\n",
+ "\n",
+ "from scipy import interpolate\n",
+ "\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from lib.carrier import SpectraCarrier\n",
+ "import lib.utils as utils"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2020.09.1\n"
+ ]
+ }
+ ],
+ "source": [
+ "# RDKit setup: report the installed version, then mute RDKit's own logging.\n",
+ "import rdkit\n",
+ "from rdkit import Chem, RDLogger\n",
+ "\n",
+ "print(rdkit.__version__)\n",
+ "# NOTE(review): 'rdApp.*' is presumably a wildcard covering every RDKit log channel -- confirm in the RDKit docs.\n",
+ "RDLogger.DisableLog('rdApp.*')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from lib.ifg import identify_functional_groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Render every column of a DataFrame (None lifts pandas' column limit).\n",
+ "pd.options.display.max_columns = None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1. Load the train / validation / test DataFrames"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Root of the on-disk data. The sub-folders are derived from it so that\n",
+ "# relocating the dataset only requires editing DATA_DIR.\n",
+ "DATA_DIR = './data'\n",
+ "SOURCE_DIR = '{}/source'.format(DATA_DIR)  # evaluates to './data/source'\n",
+ "TARGET_DIR = '{}/target'.format(DATA_DIR)  # evaluates to './data/target'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# File names of the train / valid / test pickles, all located in SOURCE_DIR.\n",
+ "fn_train_df, fn_valid_df, fn_test_df = [\n",
+ "    template.format(SOURCE_DIR)\n",
+ "    for template in ('{}/train_df.pk', '{}/valid_df.pk', '{}/test_df.pk')\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def load_pickle(path):\n",
+ "    \"\"\"Return the object unpickled from the file at ``path``.\n",
+ "\n",
+ "    The file is opened in binary mode and is closed again before returning.\n",
+ "    \"\"\"\n",
+ "    # SECURITY: pickle.load can execute arbitrary code while deserialising;\n",
+ "    # only use it on files produced by this project.\n",
+ "    with open(path, 'rb') as handle:\n",
+ "        return pickle.load(handle)\n",
+ "\n",
+ "\n",
+ "# The three splits are read in exactly the same way, so they share one helper.\n",
+ "loaded_train_df = load_pickle(fn_train_df)\n",
+ "loaded_valid_df = load_pickle(fn_valid_df)\n",
+ "loaded_test_df = load_pickle(fn_test_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# display.max_columns is already set to None in the setup section above, so only the row limit is lifted here.\n",
+ "# CAUTION: with max_rows=None a bare DataFrame expression renders every row -- keep slicing (head/tail) before display.\n",
+ "pd.set_option('display.max_rows', None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " fn | \n",
+ " cano_smi | \n",
+ " CO | \n",
+ " cOC | \n",
+ " COC(-,:C)=O | \n",
+ " cnc | \n",
+ " cCl | \n",
+ " cO | \n",
+ " CCl | \n",
+ " CC(-,:C)=O | \n",
+ " cBr | \n",
+ " c[N&+](=O)[O&-] | \n",
+ " cC(-,:C)=O | \n",
+ " C=CC | \n",
+ " cN | \n",
+ " cC(=O)OC | \n",
+ " COC | \n",
+ " CF | \n",
+ " CBr | \n",
+ " coc | \n",
+ " cF | \n",
+ " CC(=O)O | \n",
+ " c=O | \n",
+ " c[n&H1]c | \n",
+ " csc | \n",
+ " cC=O | \n",
+ " CNC | \n",
+ " CN | \n",
+ " CN(-,:C)C | \n",
+ " cC#N | \n",
+ " cn(-,:c)C | \n",
+ " cC(=O)O | \n",
+ " CC=C(-,:C)C | \n",
+ " CC#N | \n",
+ " cNC(-,:C)=O | \n",
+ " cNC | \n",
+ " C/C=C/C | \n",
+ " CC=CC | \n",
+ " C=C(-,:C)C | \n",
+ " C#CC | \n",
+ " cC(-,:c)=O | \n",
+ " cN(-,:C)C | \n",
+ " CC(=O)OC | \n",
+ " CC#CC | \n",
+ " cI | \n",
+ " CNC(-,:C)=O | \n",
+ " cC=Cc | \n",
+ " c-n(-,:c)c | \n",
+ " cnn(-,:c)C | \n",
+ " cnnc | \n",
+ " cP(-,:c)c | \n",
+ " CS | \n",
+ " spectrum | \n",
+ " (W)-CO | \n",
+ " (W)-cOC | \n",
+ " (W)-COC(-,:C)=O | \n",
+ " (W)-cnc | \n",
+ " (W)-cCl | \n",
+ " (W)-cO | \n",
+ " (W)-CCl | \n",
+ " (W)-CC(-,:C)=O | \n",
+ " (W)-cBr | \n",
+ " (W)-c[N&+](=O)[O&-] | \n",
+ " (W)-cC(-,:C)=O | \n",
+ " (W)-C=CC | \n",
+ " (W)-cN | \n",
+ " (W)-cC(=O)OC | \n",
+ " (W)-COC | \n",
+ " (W)-CF | \n",
+ " (W)-CBr | \n",
+ " (W)-coc | \n",
+ " (W)-cF | \n",
+ " (W)-CC(=O)O | \n",
+ " (W)-c=O | \n",
+ " (W)-c[n&H1]c | \n",
+ " (W)-csc | \n",
+ " (W)-cC=O | \n",
+ " (W)-CNC | \n",
+ " (W)-CN | \n",
+ " (W)-CN(-,:C)C | \n",
+ " (W)-cC#N | \n",
+ " (W)-cn(-,:c)C | \n",
+ " (W)-cC(=O)O | \n",
+ " (W)-CC=C(-,:C)C | \n",
+ " (W)-CC#N | \n",
+ " (W)-cNC(-,:C)=O | \n",
+ " (W)-cNC | \n",
+ " (W)-C/C=C/C | \n",
+ " (W)-CC=CC | \n",
+ " (W)-C=C(-,:C)C | \n",
+ " (W)-C#CC | \n",
+ " (W)-cC(-,:c)=O | \n",
+ " (W)-cN(-,:C)C | \n",
+ " (W)-CC(=O)OC | \n",
+ " (W)-CC#CC | \n",
+ " (W)-cI | \n",
+ " (W)-CNC(-,:C)=O | \n",
+ " (W)-cC=Cc | \n",
+ " (W)-c-n(-,:c)c | \n",
+ " (W)-cnn(-,:c)C | \n",
+ " (W)-cnnc | \n",
+ " (W)-cP(-,:c)c | \n",
+ " (W)-CS | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1136-86-3 | \n",
+ " COc1cc(C(C)=O)cc(OC)c1OC | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " None | \n",
+ " 0.567877 | \n",
+ " 7.202310 | \n",
+ " 0.540732 | \n",
+ " 0.532957 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 0.516949 | \n",
+ " 0.519961 | \n",
+ " 15.059375 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 0.508548 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 0.506836 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 610-54-8 | \n",
+ " CCOc1ccc([N+](=O)[O-])cc1[N+](=O)[O-] | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " None | \n",
+ " 0.567877 | \n",
+ " 7.202310 | \n",
+ " 0.540732 | \n",
+ " 0.532957 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 0.516949 | \n",
+ " 14.326757 | \n",
+ " 0.518953 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 0.508548 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 0.506836 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 645-36-3 | \n",
+ " CCOC(CN)OCC | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " None | \n",
+ " 0.567877 | \n",
+ " 0.541339 | \n",
+ " 0.540732 | \n",
+ " 0.532957 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 0.516949 | \n",
+ " 0.519961 | \n",
+ " 0.518953 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 32.721605 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 0.506836 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " fn cano_smi CO cOC COC(-,:C)=O \\\n",
+ "0 1136-86-3 COc1cc(C(C)=O)cc(OC)c1OC 0 1 0 \n",
+ "3 610-54-8 CCOc1ccc([N+](=O)[O-])cc1[N+](=O)[O-] 0 1 0 \n",
+ "5 645-36-3 CCOC(CN)OCC 0 0 0 \n",
+ "\n",
+ " cnc cCl cO CCl CC(-,:C)=O cBr c[N&+](=O)[O&-] cC(-,:C)=O C=CC cN \\\n",
+ "0 0 0 0 0 0 0 0 1 0 0 \n",
+ "3 0 0 0 0 0 0 1 0 0 0 \n",
+ "5 0 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " cC(=O)OC COC CF CBr coc cF CC(=O)O c=O c[n&H1]c csc cC=O CNC \\\n",
+ "0 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "5 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " CN CN(-,:C)C cC#N cn(-,:c)C cC(=O)O CC=C(-,:C)C CC#N cNC(-,:C)=O \\\n",
+ "0 0 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 0 \n",
+ "5 1 0 0 0 0 0 0 0 \n",
+ "\n",
+ " cNC C/C=C/C CC=CC C=C(-,:C)C C#CC cC(-,:c)=O cN(-,:C)C CC(=O)OC \\\n",
+ "0 0 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 0 \n",
+ "5 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " CC#CC cI CNC(-,:C)=O cC=Cc c-n(-,:c)c cnn(-,:c)C cnnc cP(-,:c)c CS \\\n",
+ "0 0 0 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 0 0 \n",
+ "5 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " spectrum (W)-CO (W)-cOC (W)-COC(-,:C)=O (W)-cnc (W)-cCl (W)-cO \\\n",
+ "0 None 0.567877 7.202310 0.540732 0.532957 0.53604 0.528167 \n",
+ "3 None 0.567877 7.202310 0.540732 0.532957 0.53604 0.528167 \n",
+ "5 None 0.567877 0.541339 0.540732 0.532957 0.53604 0.528167 \n",
+ "\n",
+ " (W)-CCl (W)-CC(-,:C)=O (W)-cBr (W)-c[N&+](=O)[O&-] (W)-cC(-,:C)=O \\\n",
+ "0 0.526322 0.521537 0.516949 0.519961 15.059375 \n",
+ "3 0.526322 0.521537 0.516949 14.326757 0.518953 \n",
+ "5 0.526322 0.521537 0.516949 0.519961 0.518953 \n",
+ "\n",
+ " (W)-C=CC (W)-cN (W)-cC(=O)OC (W)-COC (W)-CF (W)-CBr (W)-coc \\\n",
+ "0 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "3 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "5 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "\n",
+ " (W)-cF (W)-CC(=O)O (W)-c=O (W)-c[n&H1]c (W)-csc (W)-cC=O \\\n",
+ "0 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "3 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "5 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "\n",
+ " (W)-CNC (W)-CN (W)-CN(-,:C)C (W)-cC#N (W)-cn(-,:c)C (W)-cC(=O)O \\\n",
+ "0 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n",
+ "3 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n",
+ "5 0.508441 32.721605 0.508441 0.505985 0.507156 0.50705 \n",
+ "\n",
+ " (W)-CC=C(-,:C)C (W)-CC#N (W)-cNC(-,:C)=O (W)-cNC (W)-C/C=C/C \\\n",
+ "0 0.506836 0.50673 0.505772 0.505879 0.505348 \n",
+ "3 0.506836 0.50673 0.505772 0.505879 0.505348 \n",
+ "5 0.506836 0.50673 0.505772 0.505879 0.505348 \n",
+ "\n",
+ " (W)-CC=CC (W)-C=C(-,:C)C (W)-C#CC (W)-cC(-,:c)=O (W)-cN(-,:C)C \\\n",
+ "0 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "3 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "5 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "\n",
+ " (W)-CC(=O)OC (W)-CC#CC (W)-cI (W)-CNC(-,:C)=O (W)-cC=Cc \\\n",
+ "0 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "3 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "5 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "\n",
+ " (W)-c-n(-,:c)c (W)-cnn(-,:c)C (W)-cnnc (W)-cP(-,:c)c (W)-CS \n",
+ "0 0.501979 0.503132 0.502398 0.502922 0.502398 \n",
+ "3 0.501979 0.503132 0.502398 0.502922 0.502398 \n",
+ "5 0.501979 0.503132 0.502398 0.502922 0.502398 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# First three rows of the training frame (positional slice; same rows as head(3)).\n",
+ "# In the saved output these rows have CAS-number-style ids in `fn` and `spectrum` is None.\n",
+ "loaded_train_df.iloc[:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " fn | \n",
+ " cano_smi | \n",
+ " CO | \n",
+ " cOC | \n",
+ " COC(-,:C)=O | \n",
+ " cnc | \n",
+ " cCl | \n",
+ " cO | \n",
+ " CCl | \n",
+ " CC(-,:C)=O | \n",
+ " cBr | \n",
+ " c[N&+](=O)[O&-] | \n",
+ " cC(-,:C)=O | \n",
+ " C=CC | \n",
+ " cN | \n",
+ " cC(=O)OC | \n",
+ " COC | \n",
+ " CF | \n",
+ " CBr | \n",
+ " coc | \n",
+ " cF | \n",
+ " CC(=O)O | \n",
+ " c=O | \n",
+ " c[n&H1]c | \n",
+ " csc | \n",
+ " cC=O | \n",
+ " CNC | \n",
+ " CN | \n",
+ " CN(-,:C)C | \n",
+ " cC#N | \n",
+ " cn(-,:c)C | \n",
+ " cC(=O)O | \n",
+ " CC=C(-,:C)C | \n",
+ " CC#N | \n",
+ " cNC(-,:C)=O | \n",
+ " cNC | \n",
+ " C/C=C/C | \n",
+ " CC=CC | \n",
+ " C=C(-,:C)C | \n",
+ " C#CC | \n",
+ " cC(-,:c)=O | \n",
+ " cN(-,:C)C | \n",
+ " CC(=O)OC | \n",
+ " CC#CC | \n",
+ " cI | \n",
+ " CNC(-,:C)=O | \n",
+ " cC=Cc | \n",
+ " c-n(-,:c)c | \n",
+ " cnn(-,:c)C | \n",
+ " cnnc | \n",
+ " cP(-,:c)c | \n",
+ " CS | \n",
+ " spectrum | \n",
+ " (W)-CO | \n",
+ " (W)-cOC | \n",
+ " (W)-COC(-,:C)=O | \n",
+ " (W)-cnc | \n",
+ " (W)-cCl | \n",
+ " (W)-cO | \n",
+ " (W)-CCl | \n",
+ " (W)-CC(-,:C)=O | \n",
+ " (W)-cBr | \n",
+ " (W)-c[N&+](=O)[O&-] | \n",
+ " (W)-cC(-,:C)=O | \n",
+ " (W)-C=CC | \n",
+ " (W)-cN | \n",
+ " (W)-cC(=O)OC | \n",
+ " (W)-COC | \n",
+ " (W)-CF | \n",
+ " (W)-CBr | \n",
+ " (W)-coc | \n",
+ " (W)-cF | \n",
+ " (W)-CC(=O)O | \n",
+ " (W)-c=O | \n",
+ " (W)-c[n&H1]c | \n",
+ " (W)-csc | \n",
+ " (W)-cC=O | \n",
+ " (W)-CNC | \n",
+ " (W)-CN | \n",
+ " (W)-CN(-,:C)C | \n",
+ " (W)-cC#N | \n",
+ " (W)-cn(-,:c)C | \n",
+ " (W)-cC(=O)O | \n",
+ " (W)-CC=C(-,:C)C | \n",
+ " (W)-CC#N | \n",
+ " (W)-cNC(-,:C)=O | \n",
+ " (W)-cNC | \n",
+ " (W)-C/C=C/C | \n",
+ " (W)-CC=CC | \n",
+ " (W)-C=C(-,:C)C | \n",
+ " (W)-C#CC | \n",
+ " (W)-cC(-,:c)=O | \n",
+ " (W)-cN(-,:C)C | \n",
+ " (W)-CC(=O)OC | \n",
+ " (W)-CC#CC | \n",
+ " (W)-cI | \n",
+ " (W)-CNC(-,:C)=O | \n",
+ " (W)-cC=Cc | \n",
+ " (W)-c-n(-,:c)c | \n",
+ " (W)-cnn(-,:c)C | \n",
+ " (W)-cnnc | \n",
+ " (W)-cP(-,:c)c | \n",
+ " (W)-CS | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6288 | \n",
+ " a5e81bc8-7cd1-4603-bf77-d6635351f25b | \n",
+ " CC1=CC[C@@H](C2(C)CC2(C)C(=O)OC(C)(C)C)CC1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " [0.9202703349311558, 0.9234971350037704, 0.926... | \n",
+ " 0.567877 | \n",
+ " 0.541339 | \n",
+ " 7.301515 | \n",
+ " 0.532957 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 0.516949 | \n",
+ " 0.519961 | \n",
+ " 0.518953 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 0.508548 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 40.776154 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ " 6289 | \n",
+ " 427ee4cc-e725-4b40-829d-51b14984e029 | \n",
+ " Brc1cnc2ccccc2n1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " [0.6751178997222926, 0.6719449948267988, 0.668... | \n",
+ " 0.567877 | \n",
+ " 0.541339 | \n",
+ " 0.540732 | \n",
+ " 8.894128 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 16.775000 | \n",
+ " 0.519961 | \n",
+ " 0.518953 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 0.508548 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 0.506836 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ " 6290 | \n",
+ " dcf80c78-98e2-4ef9-b8a8-bc259aa8f9a4 | \n",
+ " CCCC[Sn](CCCC)(CCCC)c1cnc2ccccc2n1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " [0.5719682556458464, 0.5722615937762792, 0.571... | \n",
+ " 0.567877 | \n",
+ " 0.541339 | \n",
+ " 0.540732 | \n",
+ " 8.894128 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 0.516949 | \n",
+ " 0.519961 | \n",
+ " 0.518953 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 0.508548 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 0.506836 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " fn \\\n",
+ "6288 a5e81bc8-7cd1-4603-bf77-d6635351f25b \n",
+ "6289 427ee4cc-e725-4b40-829d-51b14984e029 \n",
+ "6290 dcf80c78-98e2-4ef9-b8a8-bc259aa8f9a4 \n",
+ "\n",
+ " cano_smi CO cOC COC(-,:C)=O cnc \\\n",
+ "6288 CC1=CC[C@@H](C2(C)CC2(C)C(=O)OC(C)(C)C)CC1 0 0 1 0 \n",
+ "6289 Brc1cnc2ccccc2n1 0 0 0 1 \n",
+ "6290 CCCC[Sn](CCCC)(CCCC)c1cnc2ccccc2n1 0 0 0 1 \n",
+ "\n",
+ " cCl cO CCl CC(-,:C)=O cBr c[N&+](=O)[O&-] cC(-,:C)=O C=CC cN \\\n",
+ "6288 0 0 0 0 0 0 0 0 0 \n",
+ "6289 0 0 0 0 1 0 0 0 0 \n",
+ "6290 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " cC(=O)OC COC CF CBr coc cF CC(=O)O c=O c[n&H1]c csc cC=O CNC \\\n",
+ "6288 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "6289 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "6290 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " CN CN(-,:C)C cC#N cn(-,:c)C cC(=O)O CC=C(-,:C)C CC#N cNC(-,:C)=O \\\n",
+ "6288 0 0 0 0 0 1 0 0 \n",
+ "6289 0 0 0 0 0 0 0 0 \n",
+ "6290 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " cNC C/C=C/C CC=CC C=C(-,:C)C C#CC cC(-,:c)=O cN(-,:C)C CC(=O)OC \\\n",
+ "6288 0 0 0 0 0 0 0 0 \n",
+ "6289 0 0 0 0 0 0 0 0 \n",
+ "6290 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " CC#CC cI CNC(-,:C)=O cC=Cc c-n(-,:c)c cnn(-,:c)C cnnc cP(-,:c)c \\\n",
+ "6288 0 0 0 0 0 0 0 0 \n",
+ "6289 0 0 0 0 0 0 0 0 \n",
+ "6290 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " CS spectrum (W)-CO \\\n",
+ "6288 0 [0.9202703349311558, 0.9234971350037704, 0.926... 0.567877 \n",
+ "6289 0 [0.6751178997222926, 0.6719449948267988, 0.668... 0.567877 \n",
+ "6290 0 [0.5719682556458464, 0.5722615937762792, 0.571... 0.567877 \n",
+ "\n",
+ " (W)-cOC (W)-COC(-,:C)=O (W)-cnc (W)-cCl (W)-cO (W)-CCl \\\n",
+ "6288 0.541339 7.301515 0.532957 0.53604 0.528167 0.526322 \n",
+ "6289 0.541339 0.540732 8.894128 0.53604 0.528167 0.526322 \n",
+ "6290 0.541339 0.540732 8.894128 0.53604 0.528167 0.526322 \n",
+ "\n",
+ " (W)-CC(-,:C)=O (W)-cBr (W)-c[N&+](=O)[O&-] (W)-cC(-,:C)=O \\\n",
+ "6288 0.521537 0.516949 0.519961 0.518953 \n",
+ "6289 0.521537 16.775000 0.519961 0.518953 \n",
+ "6290 0.521537 0.516949 0.519961 0.518953 \n",
+ "\n",
+ " (W)-C=CC (W)-cN (W)-cC(=O)OC (W)-COC (W)-CF (W)-CBr (W)-coc \\\n",
+ "6288 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "6289 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "6290 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "\n",
+ " (W)-cF (W)-CC(=O)O (W)-c=O (W)-c[n&H1]c (W)-csc (W)-cC=O \\\n",
+ "6288 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "6289 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "6290 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "\n",
+ " (W)-CNC (W)-CN (W)-CN(-,:C)C (W)-cC#N (W)-cn(-,:c)C (W)-cC(=O)O \\\n",
+ "6288 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n",
+ "6289 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n",
+ "6290 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n",
+ "\n",
+ " (W)-CC=C(-,:C)C (W)-CC#N (W)-cNC(-,:C)=O (W)-cNC (W)-C/C=C/C \\\n",
+ "6288 40.776154 0.50673 0.505772 0.505879 0.505348 \n",
+ "6289 0.506836 0.50673 0.505772 0.505879 0.505348 \n",
+ "6290 0.506836 0.50673 0.505772 0.505879 0.505348 \n",
+ "\n",
+ " (W)-CC=CC (W)-C=C(-,:C)C (W)-C#CC (W)-cC(-,:c)=O (W)-cN(-,:C)C \\\n",
+ "6288 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "6289 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "6290 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "\n",
+ " (W)-CC(=O)OC (W)-CC#CC (W)-cI (W)-CNC(-,:C)=O (W)-cC=Cc \\\n",
+ "6288 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "6289 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "6290 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "\n",
+ " (W)-c-n(-,:c)c (W)-cnn(-,:c)C (W)-cnnc (W)-cP(-,:c)c (W)-CS \n",
+ "6288 0.501979 0.503132 0.502398 0.502922 0.502398 \n",
+ "6289 0.501979 0.503132 0.502398 0.502922 0.502398 \n",
+ "6290 0.501979 0.503132 0.502398 0.502922 0.502398 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Last three rows of the training frame (positional slice; same rows as tail(3)).\n",
+ "# In the saved output these rows have UUID-style ids in `fn` and a populated `spectrum` list.\n",
+ "loaded_train_df.iloc[-3:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXxkZZno8d9TlX1PutNbet9paKCb0OyyDnSDIzKDjjjCDOogCuq9VwfxOpt3NscZvTNeUS7ei6h3hHFwoRWQTUGFhibQ+77Qna07Saez71X13j/OklOVSlKVVJJTxfP9fPrTVadOVd6cJE899bybGGNQSimVuQIz3QCllFJTSwO9UkplOA30SimV4TTQK6VUhtNAr5RSGS5rpr7w7NmzzdKlS2fqyyulVFp66623zhhjKpN5zowF+qVLl1JTUzNTX14ppdKSiJxM9jlaulFKqQyngV4ppTKcBnqllMpwGuiVUirDaaBXSqkMN26gF5FHRaRZRPaO8riIyDdE5KiI7BaRjalvplJKqYlKJKN/DNg8xuNbgFX2v3uAb0++WUoppVJl3EBvjPkNcHaMU24Fvm8srwNlIjI/VQ1USim/MMbwo5o62nsHZ7opSUlFjb4KqPPcr7ePjSAi94hIjYjUtLS0pOBLK6XU9NnX2MkDT+7ma88fnummJCUVgV7iHIu7m4kx5hFjTLUxprqyMqkZvEopNeMa2vsA2NPQMcMtSU4qAn09sMhzfyHQmILXVUopXznd0Q9AblZ6DVhMRWu3AnfZo28uBTqMMadS8LpKKeUrTZ1WoM/PCc5wS5Iz7qJmIvI4cA0wW0Tqgb8GsgGMMQ8DzwA3A0eBXuDuqWqsUkrNpL6hMABD4cgMtyQ54wZ6Y8wd4zxugPtS1iKllPKpgZAV4AdD6RXo06vQpJRSM8gJ8AMa6JVSKjM5AX5gSAO9UkplpMGQVaMfzLQavVJKvdt96ad7WF9V6mb0oYgGeqWUyhjdAyH+/Y1aAC5fMQuANIvzWrpRSqmxvNPS4952MvqIiTv537c00Cul1BiOtXS7twc10CulVOZxAn1hTpABuzM2kl5xXgO9UkqNxQn0oYgZLt2kWaTXQK+UUmM41mzV6AdCEXf8vJZulFIqQ4TCEd5pHe6M7R0MARDWjF4ppTLD4aZuBkMR1s4rBqBn0KrRp1lCr4FeKaVGs6OuDYCLl1YAw5l8OM0ivQZ6pZQaxY7admYV5rC8sjDquNbolVIZ7WRrD3c88jrN9iYcmezt2jY2LC4jKxgdKnVmrFIqo/1g20m2HW9l667M3jH0TPcAx1t6qF5aQXZgeGvs/OygZvRKqcx2pntgppswLWpOnAXg4qXlBL2BPkcDvVIqwzV3WYG+eyA0wy0ZXVf/EMu/+DTP7pn49tXbjrWSnx1kfVUZ2Z7SjZXRg0mjYK+BXimVMGMMh5usmaK99lBDP2po7yNi4GsvHJ7Q840xvHK4hUuWV5CTFSArOJzR52UH7HNS0tRpoYFeKZWwurN9bunGmTzkR87iY56KS1KOtfRworWX69fOASAr4Mnoc4JAeg2x1ECvlErY27Vt7u3eAf9m9D122wIysUj/0oEmAK4/Zy4AWTGdsZBeQyx14xGlVMLeOtlGUW4W80rzfF266bH7D/LsoJys14+3smpOEQvK8gFiSjd2oE+jIZaa0SulEvbWyTYuXFRGUW4WPT4u3Thty59AoDfGsKehg/MXlrnHYjtjIb0yeg30SqmE9AyEOHi6k42LyyjMDdLn64zeaptTT0/G6c5+znQPcv7CUvdYVszwStAavVIqA+2qaydiYOOScvKzs9wFvvzI6Sh2RsgkY099BwDnVXkCfZyM3mjpRimVaZyO2A2Ly+2M3r+lG2eMf15W8hn93oYOAgLr5pe4x3I8gT5PSzdKqUy17Xgra+YWU5qfTUGOfzP6/qEwbT2D1p0JDLo51tLD
klmFUWUfb2eslm6UUhmpZyDEm++0cfWaSsDaP7Wla8Atc/jJ5V/5Fd/bdhKY2JZ/jR19LCjLizrm7Yx1PiVoRq+UyihP7WxkMBzhpnOtceWFudbI7N//5u8m9bqN7X189okddPUPTbqNYAX2s042T2KbeA+EwkQihqPN3W6bFpTmR53jLd3k5wTsr5WCBk8THUevlBrXL3Y3sqKykI2Ly4GJj0+P9fArx3hqZyNXrJjNBy9eNOnXO9MTveDaeOUVYwy3f3sbexqsTyY/u+8KmrsGmF8WHeijSjdao1dKZZrW7gHeeOcsW86bj9gzTfuHUlOfb2zvA2CCE1hHaOqIDvTjlW5ePtziBnmA3x5uwRiYXzp66SbXLt2k076xCQV6EdksIodE5KiIPBjn8VIR+bmI7BKRfSJyd+qbqpSaCd999QThiGHL+nnuscLc1GT0De3W5iUDocnVQYwx/PpQM3VtvVHHx8q6+wbDfO35Q1HHTrRazx8Z6IffiQL2mPo0SujHD/QiEgQeArYA64A7RGRdzGn3AfuNMRcA1wBfE5GcFLdVKTXNjDE8vr2W69bO4dwFw+PK/6h6MQDFeZOr/p7qsDL6yX5C+MXuU9z93Tf5x2cPRB0Pj/H+8YPXT7C3oZMrV852j/347XoAFlcURJ3rzeiduVOG9In0iWT0m4CjxpjjxphB4Ang1phzDFAs1ue6IuAs4N9BtkqphBw/00Nrz6DbCesoLcjmjy9ZTG7WxKu/g6EI7b1WJ+xkM3pnt6u6s31Rx8fK6F8+1MLaecXcsWlx1PEVlYUsmx29R6w30DtlpjSq3CQU6KuAOs/9evuY1zeBc4BGYA/wWWNGzhsTkXtEpEZEalpaWibY5OkXiRjae4d78o82d/Nn36/hl3tPA/Dq0TNsO9aaVhsRqHePjr4hmjr72XasNannGWPY/o6zy1LFiMezgwF3OeCJqD07XGaZTEbf2j3Arw82u/ffe/58PnnNCkrzs0cN9G09g9ScaOOqVbMpL8iOeuyGdXPdvghHVOlGnNJN+vy9J/K5K143Sex3eBOwE7gOWAG8ICK/NcZ0Rj3JmEeARwCqq6t9eZX6h8JkBYQn36qnpWuACxeX8b3XTvDigWZe/G9X8/6HXnVn3b2wvynquSsqC3n6M1elbESCUhPRNxgmOyhkBQNs3dXIf3lih5t9/ugTl7Fp2XDQjkSMW3P2+uXe0zzw5C6WzCpkdlHuiAwXICcrwFB44n/G75zpcW9PJtBv3dVIyJNef+I9K1i/sJRtx1pH7TD9/raTDEUifLB60YjvoShnZFiMDfyQXhl9IoG+HvCOe1qIlbl73Q18xVhvcUdF5B1gLbA9Ja2cAsYYRIRIxFDX1svOuna++stDNLT3jfqcG77+ypiveaylh//9ynE+e8OqVDdX+djB0538yaPb+dYfb+SiJSMzX8fehg5+ufc0d162hLkleaOel4hwxPC3v9jPjto2bttQxdZdjTR3DdAzEKKtd4jsoLCgLJ+TdufihYvK2FnXzgNP7mLrp6+kJC+b7oEQ13/tZe66bCn3Xbsy6vW/+txBOvtD7Gno4Ob18+IGuuygMDRWEXwcJ+xAHxDom0Sg/8+aes6rKuG95y/gK88e5Jz5xQAEAzJqRv/SwSYuWlzOqrnFnO7oj3qsIHfssDi8xn36RPpEAv2bwCoRWQY0AB8CPhxzTi1wPfBbEZkLrAGOp7Kh4znTPcDpjn7WzivmH545yOq5RZxXVcrtD79G/5D1y/iz+66grWeQH9XUsbu+gzs2LeJfnp/YVmPfuauadQtKiEQMc0py+d5rJ3h272n+54uHuXhZOZevmD3+i6i019U/xCuHWmjqHODPn9zN+VWlzC3J44HNaxFgMBxh/6lO7vv3tzllB5Rv/vooJ75yC2AFu8FwhNVzi8f8OgOhMLlZQerO9vLV5w7x813DudYue3bqoop82nqHWD67kLkleeyoa+PipeU8cmc15YU5/Opg
Ex99rIZf7DrFhy9ZzN/9Yj9NnQP883OHuHR5hfsmdbZnMCrbjle2Aat0E4oYN2lKxMOvHGP13CKuWzuX42d6qCjMITcr4P6NJuJ4Szf/61dH+cc/WE/d2V72n+rkf9x6LnddtpR7r17hnhcUiZvRt3YPsLu+g8/fuBqAspjSTeE4K16mY41+3EBvjAmJyP3Ac0AQeNQYs09E7rUffxj4W+AxEdmDVer5gjHmzFQ0OBSO8OaJNh7fXut2wMwqzKHVMxtuNO9/6NWo+94g/9ErlvHoq++4969aNZvzqkq579qVhCOGp3Y28NVfHuJfPnABC8vzo1a2A7jnPSu4fMVs3vu/fse3Xz6mgT6DvX68lYdfOcYN58zlL3621z1+vKWH4y1WgPzfvxk7z1n64NP824cu5LNP7ATgnX+8mWMt3ZQV5NDcOUBhbpAfvlHLjrp2tr9zlmBA+OKWtfzDMwfcAHPJsgr+38cv4YX9TcwvzWPD4nL6h8Kjlg6vXTOHxRUF/KimjoDAE2/WsbA8n/q2PrbubHQD/cuHmqOGDlaP8inF6aAcChtyssYP9LWtvXzl2YMAnPjKLdSd7WVRRQHtvYOEkvhkcMs3fkffUJiPXLqYd85Yn1ji/b2JxA/GB093AURN/vru3Rdz93ffBEbP6L9xxwYKsoMM2m1NoxJ9YjNjjTHPAM/EHHvYc7sRuDG1TYvvJzsaeODJ3QQ9dcXWnkHKCrLdHvwrV87md0et95nPXL+Kb7x0ZMTrbFxcxr7GTr50yznceekSRIT7r1vJD7ad5FPXrojqZQe467Kl7nmjOa+qlM9ev4p/e+kILV0DVBbnpuJbVlPguX2nOWdeCYtnFYx6Tjhi2NfYQTAg7Kxr58ObFrOrvoMPPfI6YI3aiGfD4jJ21LYDsKA0j0ZPaeCT16zg2y8fA3CDPMCyL0b9eZGTFd3RGY4Y/u7pA5QVZPOdu6pZUlFAaUE22cEAN6+f7543Vv+QiHDnpUv4+2cO0NxptWnr/Vfyse+9ydGWbve87e+cjfp7ckohsZwOyqFwhJwERt88v/+0e7tvMEx9Wy/nVZXSMxAas9bf0N7HkaYurlkzh8b2PrfM09I1wP7GTvKyA3H7EAIihOKsU3C4yQr0K+cWuceuXTPHvV0wyjV83wULAHh2zykgvWbGpt0SCNeumcNDH97IdWvnuGtNO8G3Z8CqKV6yrILfHT3DZctnkRUM8OnrVtLY3kdFYQ5NnQOsnGP9gMMRE/WGUVGYM2Z9PZGPpzeeO5d/e+kIj732Dn9+09rJfKsqAcYY3q5tY3ZRLosrCqJ+RkeaujjW0s2CsnxCEcPehg66+kPUnDjLr+0gveW8ecwvzaesIJvbL1rIl366h0Onu8jPCXKspSfqaz35Vr0bwB03nTuX5/Y1sWZuMZ+7cTXLZhfS2NHPvT94i//4xKXuLkX9Q1ZgW1FZxK0XLmBgKMJt33p19I//Bu6/diV1bb08sHktdWd7eeQ3x/nUNSuoHqWUkohLl88CoLGjn/uvXUlFYQ4LyvLZ3zg8buLg6S7WzivmTy9fRkN7X9Ra7F7DGX1i2bh38MKhpi4a2/vZfN58jjZ3u1lyPB98eBsN7X289Rc38OaJs+7x5/c18ZMdDZy/sDTq79gRDAjxFtg80txNWUE2lUXxE7GCcSaDiTvqZszTfCXtAn1lcS63nD8/7mOFuVnuL/JVqyrd49nBAEtmWe/4xXnD9bh4vxyTde6CUq5eXcmPaur5rzesHvWPRCUvHDF857fHOb+qlJbuAXbUtvPYayfcx7ODwrzSPApzsjDGygSdEVKxnIz12b3DWebXXxi7v2ZHbTvFeVnceekSHtg8/Cb+1smzLCwvcDtYV80tZt+Xb4oazZKXHWTlHCszXjvPWuf8yN/fzN6GDm61S4qP/mk1pfk5rLfLgt4suaos3/3dnoy1nuz8hnXW2Pj5JXm8dKCJUDhCQITDTV18sHoRm8+bN9rLAMOBfqwg
7WjvHaTmZBu3rJ/P03tO8erRMwyGI1SV55OTFRi1dHOqo88dILG7oSMq0P9kRwPAiAXIHFbpZmQ0PtXex8Ly/FETt4I4o25iXxc0o3/X+8ilS/iz79fw7N7T/L79cc8vTnf0U5gbjHrD87PG9j4+8/gOsoMBth2PPw78kmUVLK8s4oX9p6MmzGxaVsF7Vs3mwKku5pbksWx2ATedO4+wMcwvzec/a+p4bl8TZQXZnDjTQ83JNu66bAl3X7GMeSV5hCIR9zq9dfIszZ0DbFk/MsmIN9Im3pDFWMGAcMGiMq5bO4f23kGuWzt33OdMVnYwwG0bqjja3M359hvKrKJc+ocirPzSs3z4ksX0DoZZMadonFcaXtExkSGWLx9qIRwxfPTKpbx4oImXDljZ/cLyfLICMuprPLF9eArP3d99k42Ly1g5p8hdaRLgS7ecE/e5AXtUXay23iHKC0afuF80TkYfSNXCPNNIA/0UuH7tHOaW5PLzXY2+CfR7Gzr4p18e5LdHzrBkVgEv/rerR/RDpJIxBmMSC3hd/UP8qKaekrwsOvtDtPUMcqipi4a2PurbeunsD5GbFaA4N4vV84opycvi9osW0dU/xPs3VLl16X/8g/UYY+gfiiS0V+gHqhfxgWpr5HA4Yk0OunhpuedT2PBrjDVscrK+c1f1tGaHX//gBUQ8P5tZRcNB74dv1AJWlj+ebLsDdiiBSVNbdzUyrySPDYvKmVuSx/5TVqloTnGuNfEqTkbfPRDie9tOcP3aObxkT4h6u7adq1dXuoH+z29aw6KK+P0s1vDKkcfbegdHLHHgVTjO8ErnN1oz+ne5QEDYct58fri9lu6BEEXj/OJMpRf3N/G1Fw5zwP7DKivI5mRrL68da+Xnuxr5/I1rmFc68THdbT2D/HRHA6vnFnP5ilk88WYdP9vZwFsn2zDGsHJOETevn08kYthV38GRpi7ycoJUleVztmeQgVAkKjuLtd4e+TReGcEhIhPaEDoYEC5bMfnSyEQEA0JwIlshTZCI4JnoyeyikdntnJLxBxJkBaw3xHgdnl6t3QO8criFj1+1jEBAKCvIdmfFluRlk5MVoCdOie2J7bW09w7xmetX8f4NVXz68R1W2zyDHErzR/9kGvCUbo63dPP3Tx+gOC+Lk629XLO6ctTnjVe6sb9trdEraxr2Y6+d4KUDTdx6YeyKEVNjKBwhOxigtXuAf3vpCDtq290lWP+oehGfv2kNDe19vP+hV/n684fYVd9BdjDA525czWOvnuBT164Y95f8bM8g//riYYpys9hd38Hexg53dIZj9dwiPli9iJK8LLbuauRfXzyCiFVLvXBxGUebuznR2sPSWYV09g3xBxuq+ED1IiqLc8gJBokYw9LZhaPO2lSpNatwZFBPZEKXW6MPjR3xnt5zinDEcNsG6+/AG5xL8rNHLd381O5ovWBRGSWe58zydKKWjBHoxTOO/j/erHM/FQCUxSnd3LFpEY9vrxt/HL39pjxeRn+0uZslswqm9JNzojTQT5ENi8spyAmyo7Z9ygK9MYbewTDbjrXys50NPL+viUF7qNtgKMKC0jw+fuUyPnnNCvePw1lxz5lks/9UJ99/7QTf/PVRFlXk80cXRy/wVN/WS21rL5etmMXR5m7uenS7O/EHoCg3i2/98UaaOvt5evcpqpdW8IXNa9yOrgc2r+WN462snlfM7FFGOYxGg/z0mBWT0YtYc1PG44ydH2/Uzc92NLBmbrHbCV3oSSaKcrPIDgZGvMapjj5r+PPNVv29xLNKprdtY62eGRRxs+7YJRYq4nx/X7z5HD7xnhXjDqBwSvRjhfkTZ3q44euvcO/VK3hwy8yPvtNAP0WCAWHtvGK3ZJJqbT2DfPrxHe58Aa+cYICPXrGMz924ekQ2MTsmezt8uosVldaIpMb24QBujGF3fQd3Pbqdjr4hygqszMsYePLeyzje0sON586lKDfL/cO4+4plI9oSDAiXr9TJY34W
m9EXe36mY0lkeGVtay9v17bzwOY17jFn+GJxbhbBgJCdNTLQ76qzhrFWL7UmNXkHD3iDdMkYgwoCgeGsu6kzekOS8jiBviQve8zXc0gCi5q9XdsGwLN7T2mgz3TrFpTw1M7GpKaIJ+K3R1q48/9aywitqCzkz65azm0bq8gJBsb9Ot4s+cZ1c3l+fxNvHLeGrDmbNhhj+MKPd/OjGmtt7jVziznU1EX1knL+8r3ruGBR2aTGcit/ie3TGK8z0pHI8Mqf77Zmr7/PMyjByeidbDw7TulmZ10H2UHhnPnWpwDvUNOKIm+gH72tIsNr3dS3R29Ikj+JhQfd9ejHSOmdkul4pdDp4o9WZKhz5pfw/16vpb6tb9SRAYlo6xnEYGUyP3j9JH/1lDXt3lnfI1nFeVl09Yf4YPUint/f5I5TrreHJn7llwfdIP8f91zKJctn0dDex7ySvCmZe6D8pSDBzuzhmbGjR7xXDrWwvqqUheXDv//O6ztZerzSze76ds6ZXxJ3pq+3dDNWjT4g1qiburO97G/sZNOyCjYtraClayBqs5FkDdfoRz/HWbCtb9Af23JooJ9C6+xsZF9j54QC/e76dt73TWsyTXFuFhcuLuO3R85w+YpZfPX286P+eJLx009dztmeIRaUDXe4bVhcRl1bL0eauvjOb45z+0UL+eofnu9+Aqgqiz8pRWWeREeJuaWbUYZX9gyEeLu2jY9ftTzquJPlOiWc7JjljgdDEXbWtfOHGxfGfd2KhGv0VunmSHMXEQNf2LwmJcNkhzP60SO9M6qoN97U3Bkw893BGWztvBICAvsbO8Y/OcbTu09x27deA6wgn5cTZNuxVj593Uoe/dOLJxzkAVbOKWbTsgqqyvKt0Tg3ruaqVZWc6ujnvh++TWFuFv/95nO0M/Rd5OGPbHRvJ1pucAK9M7zy4OnOqMx8+4mzhCJmRPbs7DebZf9+WaWb4efd98O36R0Mc82a+EMgvX0KY5VgAnbpprPPyqrHmiSVlHFWr7SWPrc+HYfinDTehuVTQTP6KZSfE2TVnOKoXebHc7ZnkM//5y5+dbCZtfOK+Z9/dCEr5xRhjDWBJN5ogYkSEf7p9vMB+M8aawbi4aZu/vbWc1P6dZT/bT5vPrdeuICndjYmvPG3E+gHQhH2N3Zy8zd+ywOb1/Cpa6y17d84fpbsoLgdqg7njcQpA3pLNz0DIX51sJkt583jurVziCc/J8gPP34JB093jdknZe03AZ391vDfscbcJ8PdYWqUcTct3QMMhiJx1+uPRAzve+h33LZhIR+7cuTghamiGf0UW7+wlN31HQlvO/Yvzx/iVwebWVCax3fuquac+SVkBwPkZAWmNPjOscdNX7umkjsnUPdX6c9Z0iDRjN45PxQ2NHVZI7a82xXurGuLW2d33kjcQJ8VIGSXbg6e7iIcMfzhxoUjgvgVK4cntF2+cjYfHSdQBu1RN519VqBP1bIf7rYjo/xJO2Wb5bOL3O/L8auDzext6Iw7SW0qaUY/xS5YVMaTb9WP2yHb2T/EX/x0L1t3NXLnpUv48vvOndbSyeUrZvG531s9oc5dlRmckS0Jj7qxx9EPhiPukspOBhuOGPbUd/CHF42sszvllqA9xTQ7IAyGIxhjaOmyhkHGm639g49eQjiJ6ahO6aajb4j87GBCSykn9LqBsVevrLV39Vo2u5DjZ6JnfT/ym+NUleVHLS09HTSjn2IbFlnL1O6sax/zvCdr6tm6q5GsgLUu/nTXx7ODAT59/SpKC9JjsTOVem6gT3jUzfA4+q7+UNSxYy3d9AyGucBeptnLKVHnBCXqOY+9dsLdjDzeXg6BgCQ1y9SaGQudfSFK8lOX0wbcGn38SF97thcRWDq7kKGwcT/NH2/pZvuJs9x12ZJpny2rGf0UWzOvmNysADvr2kdd4GwoHOFHNXWsnVfMjz95ecIZlVKp5JZuEvz9c94YBkMRwhGrPJKbZb1JOOsXrZk3ctMSZ1kC
Z6Z0tv06X/75fvecVJQpgwFrZExn/1BCE6ESN/YSCHVne5lXkucOIw1HDFlB4Xl7Pf73b5ieJVG8NKOfYtnBABcsLOOtk21xH49EDJ95fAcHT3fxiauXa5BXM8YJwIlm9N5lip1hhM7Y+uP2jlXLK0fu/HTjuXP51DUr+KK9vEFWzKfXcnvnrMlyR930D6WsI9Z6Xev/0YpIR5q7WVFZRJZ9LZyRNztq21g6q2DSG8NPhAb6aXDxsnL2NHTQ2N434rFf7DnFs3tP8+CWtdy2If64YaWmgzPDdaztCL28pRunRu9kucdaelhQmhe3Yzc7GOCBzWvd4BtbO5+V5JpIo3EmTFmlm9QF+rGWQDDGcLS5m1Vzi8gODF8fYww7atu5cNHIUtZ00EA/DT5YvYigCB//Xg2RiKF7IMSpjj46+oZ46FdHWV5ZyD0xk0qUmm59dlbubNE5nmBACIgVyJxOWGfi0/GWbpZXjr95CTByPaYUjUgRsT4xW6Wb1Nfo41VuWroH6BsKs3RW4XBGHzac6uinuWuADYvLRz5pGmidYBosmVXIX773HP7yqX1s+oeXONMdvcDSX9yik5PUzHPiVjI7KDmbhjhBbzBkZa/HW3q4bWNitejY0k2qMvqgZ9RNSjN6zxII4Yjhrkff4E8uW8qN586jzh5aubiiwF07Kmxn82DNQJ8JGuinyYcvWUJdWx/7GzuZV1qJAFXl+QjCRy5dMtPNU4r7r11Ja/cANyW4yQtYdfonttdxsT0pajAcoWsgRNdAiIXliS2bEVu6mZ2i+SKBgBA2hr6+1Nbo3WWKjaGrf4hXj7ZypKmbG8+d546hX1RR4K4hFYkYdtS2kZsVcJdqnm4a6KdJMCD895vj722plB8snV3Id+/elNRzgkGhvXeIFw9Ym3pEIoYme7+CRDsdnZ2qHBVxNkKZCBHoH7JKSqkcdTO8OfhwqarDnpRV22oF94Xl+e6EsLAx7KhrZ31VacrG8idLa/RKqZQJG+Ou/Z5woA9Gl27yc1ITloKeElRqx9G7c2PdkUrOoVp7aGVedtD9+j0DYfY0dMxYRyxooFdKTULsAl3hiKGp08ro5yUY6LNjAv3quSPH3k+Et68hZQuaEZvRW58YnLp9XVWths4AABKeSURBVFsviyqskpWT0W87dobBUIT3jLFP7VTTQK+UmrDYkSehsKG1x8roY7coHI23dPPXv7+Oq1MUEL19vPFm2k78dYeXQIjN6E939DO/NDrQH2rqAuDcBTNTnwcN9EqpSYidHRoxhu7+ECLRe8OOxVu6ueGcuSnbjc07ki3eUgwT5bxqxBh3iWawOmdPd/Yz316nx/n6dWf7KMgJzuiKsBrolVITFru0eihi6OwPUZSblfCQYW9Gn5vgGP5EOJn3Levnp3T4sjthiuFZrwK09Q4xGIq4C7I5NfrugRCFuVkp3U40WRrolVITFruaZCRi6OoPJTXKxZvRJzorNxFObE/19pfe4ZXOMsQiwumY0UbOPLDu/tCk9qhNBQ30SqkJi10GIBQxdA8MJbwdIeAuFQCT27Q7lpNBpzrQe2v03oy+vXcQGF6QzVmGuXsgTQK9iGwWkUMiclREHhzlnGtEZKeI7BORV1LbTKWUH8V2xobtjH6svVxjeTP6VC7f6wT4lGf09v8RYwg5O0gJtNtj6cvspb7djH4gRH6CC8VNlXF/GiISBB4Cfg+oB94Uka3GmP2ec8qAbwGbjTG1IhJ/DzClVEaJ7Yx1An0y69XEDq9MFSe+xy6xMPnXHS2jtwK9M5Qz4I6jT4+MfhNw1Bhz3BgzCDwB3BpzzoeBnxhjagGMMc2pbaZSyo/idcZ29Q8ltW1f7MzYVHECbarXkRoeR2+itgps77NKN85yC84niVDEzHhGn8gVrgLqPPfr7WNeq4FyEXlZRN4SkbvivZCI3CMiNSJS09LSMrEWK6V8K2Ks1VmLJli6SSUn0Kc6o3c7Y8EdXikidPQOkZcdcDuUvTNz
E10RdKok8tOId5ViF+jMAi4CrgfygW0i8rox5nDUk4x5BHgEoLq6OvHNH5VSaSEUjjAQilCcRGfsVGX0ToBJ/aib4fXonQlTAbFKN2X5wyUr79fNy/J5jR4rg1/kub8QaIxzzhljTA/QIyK/AS4ADqOUetcIRwwDoUhSwySnKqMP29l2MMXj173r0TuLmgVEaO8bdDtiITrQp3J+wEQk8tXfBFaJyDIRyQE+BGyNOecp4CoRyRKRAuAS4EBqm6qU8ru+IWvzkmQCW/YUZfTOgJhgit9IYtejB6sfoL03ejlkb99Art8zemNMSETuB54DgsCjxph9InKv/fjDxpgDIvJLYDcQAf6PMWbvVDZcKeU/TudsThLDJKc6o0/9qBvrf8PwEghO6Wbp7AL3PO8niYFQOKVtSFZChTRjzDPAMzHHHo65/8/AP6euaUopv7toSXncje9zfVC6cYY+BlP9icGzeqUz6iZol25K80vd07ylm87+UGrbkCSdGauUmrDH7r6Ya9aMXG0yN4kNNqaudDMchFMpMLwGQtSom97BMIWeTmjvMslDoQgzSQO9UmrCivOyWRNn/fhkAv1U7ZfsBPpUf2IYnhk7/KkhEID+oXBUJ7T367bZyyPMFN1KUCk1KU6JIsfeKByS73z85DUruG5taifUuxn9lM2MNTild2cEjncGrDej/9iVy1PahmRpoFdKTYozXj032xPokxxO+IXNa1PcKk+NPsWlG+8OU86iboN2acYb6L1vMJuT2HB9KmigV0pNiokz0iaZ0s1UmaqM3pkwFTHGXetnwA703hmwqX6DmYyZ/2kopdKaYWRAnelx4zB1NXrv+4YzYcoZPumt0U9RH/OE+KgpSqm0ZGf0WVGBfuZDi9tRmvLSzXBG77yZDLoZvacz1keR3j8tUUqlJSeg5niC+0wv4gXWblcwhROmPKNunIli+ZrRK6UykbP5hjeb9UPp5n0XLgBg07KKlL6udwkEd+MRm/ca+KlGr52xSqlJGbTr1LlRgX7mc8grVs7mxFduSfnrDi9TPFy6cXhHG6W6E3gyZv6noZRKa86aMnlZ3lE3M5/RTxWJU7pxeMtEUzURbCI00CulJmX9wjIAVswpco/N9LK8Uyl6wlR06cbbAZvqvoHJ0NKNUmpSPnLJYqqXlHOkuZsfvlELJLd6ZbqJtwSCw1uuSfVon8nI3J+GUmpaiAjnzC/xbdki1bybg8fW6L1j9rVGr5TKOH7KYKeSd3PwofDoGb2fRt1ooFdKpYSfMtip5O4Zy3BHtMOvn2q0Rq+USgk/dT5ONRG7M3aMGj3A7RctZMsML2gGGuiVUinipwx2qgVErOGV4djhldFFkn/5wAXT2axRaelGKZUSfqpJTzXBqtGPl9H7hQZ6pVRK+DXITYWAyLg1ej/RQK+USol3U6BHRsnop2ij88nSQK+USol3U6APCBC3Ru/Pa6CBXimVEu+mQC9I1Hr0Dr9eAw30SqmUeDd1xgbE3hB8RI3enyHVn61SSqUdJ5v1a/kilUSESJwlEPz6rWugV0qlhBvofdohmUoi1nr03hp9VkDcWbN+o4FeKZUSzoKV74YSjuCsRz9cuvFrfR400CulUiRo16f9HPBSJRCQEUsg+Pn71kCvlEoJJ5PPyuC16B3WzNjoGr2fP8lk/k9EKTUtSguyAaheUj7DLZl61sxYw2BouHTj4zivi5oppVKjND+bn37qclbNLZ7ppkw5ESuj7x8Ku8f8vKhbQhm9iGwWkUMiclREHhzjvItFJCwit6euiUqpdLFhcTlFuZmfP4pYNfoBT0bv541Xxg30IhIEHgK2AOuAO0Rk3Sjn/RPwXKobqZRSfhIQiESsjN6ZN+DjhD6hjH4TcNQYc9wYMwg8Adwa57xPAz8GmlPYPqWU8h1BGApHiBjIzwm6R/0qkUBfBdR57tfbx1wiUgXcBjw81guJyD0iUiMiNS0tLcm2VSmlfCEg0GfX5wvsQJ/uGX285puY
+/8KfMEYE45z7vCTjHnEGFNtjKmurKxMtI1KKeUrIuIJ9FafhJ9r9In0mtQDizz3FwKNMedUA0/Y039nAzeLSMgY87OUtFIppXxEBPoGrUCfl+3/jD6RQP8msEpElgENwIeAD3tPMMYsc26LyGPALzTIK6Uylcjw0Mr87IB9zL+RftxAb4wJicj9WKNpgsCjxph9InKv/fiYdXmllMo0gTilGx/H+cQmTBljngGeiTkWN8AbY/508s1SSin/EqB/yBpDn+92xvo30usSCEoplaSAWMMrAXLstX38XKPXQK+UUskSGLLXonfW39eMXimlMkhAxF2L3t0+0L9xXgO9UkolS4CwndHnZGlGr5RSGScg4m4M7mT0WqNXSqkMIjK86YhTo3dLOD7k35YppZRPiYjbGZuT5f8tFDXQK6VUkrwhvdhef9+7f6zfaKBXSqkkeas0JfnWFoqDoTHXdJxRGuiVUipJ4snpS/KsQO+UcvxIA71SSiXJW44vybdKN85MWT/SQK+UUsnyjJnPz7YCvZPZ+1Hm7+KrlFIp5s3oF5bn87nfW837N1SN/oQZpoFeKaWS5B11EwwIn75+1Yy1JRFaulFKqSR5lzvw8/h5hwZ6pZRKkndZGz+vcePQQK+UUkkSzeiVUiqzRdXoNaNXSqnM4y3X+HgtM1caNFEppfzFG9y1dKOUUhnIuwSCdsYqpVQG8sZ2zeiVUioDRY260YxeKaUyjzeJD2hGr5RSmccJ7elQtgEN9EoplTSnAzYdyjaggV4ppZLmxPd0GEMPGuiVUippohm9UkplNie8p0NHLGigV0qppLk1+kwK9CKyWUQOichREXkwzuN/LCK77X+vicgFqW+qUkr5g1OxyZjSjYgEgYeALcA64A4RWRdz2jvA1caY84G/BR5JdUOVUsovnIw+k0o3m4CjxpjjxphB4AngVu8JxpjXjDFt9t3XgYWpbaZSSvlIpmX0QBVQ57lfbx8bzceAZ+M9ICL3iEiNiNS0tLQk3kqllPKRTKzRx/tOTNwTRa7FCvRfiPe4MeYRY0y1Maa6srIy8VYqpZSPDI+6mdFmJCwrgXPqgUWe+wuBxtiTROR84P8AW4wxralpnlJK+U8gA0s3bwKrRGSZiOQAHwK2ek8QkcXAT4A7jTGHU99MpZTyD0mzzthxM3pjTEhE7geeA4LAo8aYfSJyr/34w8BfAbOAb9kXIGSMqZ66Ziul1MxJt+GViZRuMMY8AzwTc+xhz+2PAx9PbdOUUsqfnB2mMqkzVimllIcT39NhG0HQQK+UUklzSzea0SulVGbKxJmxSimlPIY7Y2e2HYnSQK+UUkmSDJwZq5RSysOdGaudsUoplZkyca0bpZRSHjrqRimlMpw76kZLN0oplZk0o1dKqQznLIGgGb1SSmUod5niNImgadJMpZTyDy3dKKVUhtPOWKWUynSa0SulVGZzJ0xpRq+UUplpeHNwDfRKKZWRNKNXSqkM58R3zeiVUipDDS9TPMMNSVCaNFMppfzDyeO1dKOUUhnKqdGLBnqllMpMbo1eA71SSmUmXetGKaUynGjpRimlMluWndKHI2aGW5IYDfRKKZWkoAZ6pZTKbE6gjxgN9EoplZE0o1dKqQznDKtMkzivgV4ppZLllm7SJNInFOhFZLOIHBKRoyLyYJzHRUS+YT++W0Q2pr6pSinlD06gD2VKoBeRIPAQsAVYB9whIutiTtsCrLL/3QN8O8XtVEop3xgu3WRIoAc2AUeNMceNMYPAE8CtMefcCnzfWF4HykRkforbqpRSvlKSlzXTTUhIIq2sAuo89+uBSxI4pwo45T1JRO7ByvhZvHhxsm1VSilf+P0L5nOkqYv7rls5001JSCIZfbw5vrGfVxI5B2PMI8aYamNMdWVlZSLtU0op38nNCvLFm8+hJC97ppuSkEQCfT2wyHN/IdA4gXOUUkrNgEQC/ZvAKhFZJiI5wIeArTHnbAXuskffXAp0GGNOxb6QUkqp
6Tdujd4YExKR+4HngCDwqDFmn4jcaz/+MPAMcDNwFOgF7p66JiullEpGQl3GxphnsIK599jDntsGuC+1TVNKKZUKOjNWKaUynAZ6pZTKcBrolVIqw2mgV0qpDCdmhtZqEJEW4OQEnz4bOJPC5kwHbfP00DZPvXRrL2RWm5cYY5KacTpjgX4yRKTGGFM90+1IhrZ5emibp166tRe0zVq6UUqpDKeBXimlMly6BvpHZroBE6Btnh7a5qmXbu2Fd3mb07JGr5RSKnHpmtErpZRKkAZ6pZTKcGkX6MfbqHymiMgJEdkjIjtFpMY+ViEiL4jIEfv/cs/5X7S/h0MictM0tfFREWkWkb2eY0m3UUQusr/Xo/am8PE2npnKNv+NiDTY13qniNzsszYvEpFfi8gBEdknIp+1j/vyWo/RXt9eZxHJE5HtIrLLbvOX7eO+vMbjtHnqr7MxJm3+YS2TfAxYDuQAu4B1M90uu20ngNkxx74KPGjffhD4J/v2OrvtucAy+3sKTkMb3wNsBPZOpo3AduAyrJ3FngW2THOb/wb4fJxz/dLm+cBG+3YxcNhumy+v9Rjt9e11tl+/yL6dDbwBXOrXazxOm6f8OqdbRp/IRuV+civwPfv294D3e44/YYwZMMa8g7WO/6apbowx5jfA2cm0UaxN30uMMduM9Rv3fc9zpqvNo/FLm08ZY962b3cBB7D2UPbltR6jvaOZ8etsLN323Wz7n8Gn13icNo8mZW1Ot0A/2ibkfmCA50XkLbE2QQeYa+ydtuz/59jH/fR9JNvGKvt27PHpdr+I7LZLO87Hc9+1WUSWAhuwsjffX+uY9oKPr7OIBEVkJ9AMvGCM8f01HqXNMMXXOd0CfUKbkM+QK4wxG4EtwH0i8p4xzvXz9+EYrY1+aPu3gRXAhcAp4Gv2cV+1WUSKgB8D/8UY0znWqXGOTXu747TX19fZGBM2xlyItUf1JhE5b4zT/dzmKb/O6RbofbsJuTGm0f6/GfgpVimmyf6Yhf1/s326n76PZNtYb9+OPT5tjDFN9h9MBPgOw2Uv37RZRLKxgua/G2N+Yh/27bWO1950uM52O9uBl4HN+Pgae3nbPB3XOd0CfSIblU87ESkUkWLnNnAjsBerbX9in/YnwFP27a3Ah0QkV0SWAauwOldmQlJttD8Od4nIpXZP/12e50wL5w/ZdhvWtfZNm+2v8X+BA8aYr3se8uW1Hq29fr7OIlIpImX27XzgBuAgPr3GY7V5Wq7zVPQuT+U/rE3ID2P1QH9ppttjt2k5Vu/4LmCf0y5gFvAScMT+v8LznC/Z38MhpnAESEw7H8f6aDiElRV8bCJtBKrtX8ZjwDexZ1hPY5t/AOwBdtt/DPN91uYrsT5K7wZ22v9u9uu1HqO9vr3OwPnADrtte4G/so/78hqP0+Ypv866BIJSSmW4dCvdKKWUSpIGeqWUynAa6JVSKsNpoFdKqQyngV4ppTKcBnqllMpwGuiVUirD/X+RFfLtdUsCWgAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.plot(loaded_test_df['spectrum'].iloc[-1][::-1])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Dataset composition"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### These datasets contain NIST `basic information` & KIT complat in-house `spectra`.\n",
+ "\n",
+ "Since we are not in a position to distribute NIST spectra, the dataframes saved in this repository only contain:\n",
+ "\n",
+ "1. NIST filenames and canonical SMILES without spectra.\n",
+ "2. KIT complat in-house filenames and canonical SMILES with spectra in the `spectrum` column.\n",
+ "\n",
+ "- Therefore, the number of NIST and in-house entries can be determined by checking which rows have an empty `spectrum` column.\n",
+ "- Once you have obtained the NIST spectra, they will be loaded in the next notebook.\n",
+ "- To purchase NIST spectra : __TBD__."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training dataset count 4819 / NIST 3930 / complat 889\n",
+ "Validation dataset count 733 / NIST 557 / complat 176\n",
+ "Testing dataset count 739 / NIST 579 / complat 160\n"
+ ]
+ }
+ ],
+ "source": [
+ "for template, split_df in (\n",
+ "    ('Training dataset count {} / NIST {} / complat {}', loaded_train_df),\n",
+ "    ('Validation dataset count {} / NIST {} / complat {}', loaded_valid_df),\n",
+ "    ('Testing dataset count {} / NIST {} / complat {}', loaded_test_df),\n",
+ "):\n",
+ "    n_total = split_df.shape[0]\n",
+ "    # Rows whose `spectrum` is null are reported as NIST; the remainder as complat.\n",
+ "    n_nist = split_df['spectrum'].isnull().sum()\n",
+ "    print(template.format(n_total, n_nist, n_total - n_nist))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3. Functional groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clear_mapnum(mol):\n",
+ "    \"\"\"Clear the 'molAtomMapNumber' property from every atom of `mol`, in place.\n",
+ "\n",
+ "    Atoms that do not carry the property are left untouched. Returns None.\n",
+ "    \"\"\"\n",
+ "    # Plain loop: the calls are made only for their side effect, so there is\n",
+ "    # no reason to collect their return values in a throwaway list.\n",
+ "    for atom in mol.GetAtoms():\n",
+ "        if atom.HasProp('molAtomMapNumber'):\n",
+ "            atom.ClearProp('molAtomMapNumber')\n",
+ "\n",
+ "def extract_fgs(mol):\n",
+ "    \"\"\"Return the unique functional-group SMARTS found in `mol`.\n",
+ "\n",
+ "    Each group reported by `identify_functional_groups` is parsed from its\n",
+ "    `type` string with `Chem.MolFromSmarts`, passed through `clear_mapnum`,\n",
+ "    and written back out with `Chem.MolToSmarts`.\n",
+ "\n",
+ "    Returns a list of SMARTS strings without duplicates, in first-seen order.\n",
+ "    \"\"\"\n",
+ "    fg_smas = []\n",
+ "    for fg in identify_functional_groups(mol):\n",
+ "        # Separate name so the `mol` argument is not rebound inside the loop.\n",
+ "        fg_mol = Chem.MolFromSmarts(fg.type)\n",
+ "        clear_mapnum(fg_mol)\n",
+ "        fg_smas.append(Chem.MolToSmarts(fg_mol))\n",
+ "\n",
+ "    # dict.fromkeys de-duplicates while keeping insertion order, so the result\n",
+ "    # is reproducible between runs (set iteration order depends on string hashing).\n",
+ "    return list(dict.fromkeys(fg_smas))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count, over the train, validation and test dataframes, how many molecules\n",
+ "# contain each functional group. extract_fgs returns each group at most once\n",
+ "# per molecule, so every molecule adds at most 1 to a group's count.\n",
+ "# fg_dict is a Counter (a dict subclass), so a missing key reads as 0.\n",
+ "fg_dict = Counter()\n",
+ "for split_df in (loaded_train_df, loaded_valid_df, loaded_test_df):\n",
+ "    for csmi in split_df['cano_smi']:\n",
+ "        m = Chem.MolFromSmiles(csmi)\n",
+ "        fg_dict.update(extract_fgs(m))\n",
+ "\n",
+ "# Plain dict ordered by descending count; equal counts keep first-seen order.\n",
+ "fg_count_from_mol = dict(fg_dict.most_common())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "col_nms = loaded_train_df.columns[2:52]\n",
+ "# For each selected column, add up its sums over the three dataframes.\n",
+ "fg_count_from_df = {\n",
+ "    cn: sum(split_df[cn].sum() for split_df in (loaded_train_df, loaded_valid_df, loaded_test_df))\n",
+ "    for cn in col_nms\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "functional group SMARTS / count in the dataframe\n",
+ "CO / 765\n",
+ "cOC / 523\n",
+ "COC(-,:C)=O / 502\n",
+ "cnc / 440\n",
+ "cCl / 436\n",
+ "cO / 349\n",
+ "CCl / 318\n",
+ "CC(-,:C)=O / 263\n",
+ "cBr / 230\n",
+ "c[N&+](=O)[O&-] / 245\n",
+ "cC(-,:C)=O / 235\n",
+ "C=CC / 237\n",
+ "cN / 224\n",
+ "cC(=O)OC / 210\n",
+ "COC / 205\n",
+ "CF / 198\n",
+ "CBr / 195\n",
+ "coc / 169\n",
+ "cF / 171\n",
+ "CC(=O)O / 145\n",
+ "c=O / 135\n",
+ "c[n&H1]c / 124\n",
+ "csc / 113\n",
+ "cC=O / 105\n",
+ "CNC / 110\n",
+ "CN / 105\n",
+ "CN(-,:C)C / 105\n",
+ "cC#N / 82\n",
+ "cn(-,:c)C / 93\n",
+ "cC(=O)O / 89\n",
+ "CC=C(-,:C)C / 86\n",
+ "CC#N / 86\n",
+ "cNC(-,:C)=O / 79\n",
+ "cNC / 76\n",
+ "C/C=C/C / 66\n",
+ "CC=CC / 77\n",
+ "C=C(-,:C)C / 71\n",
+ "C#CC / 71\n",
+ "cC(-,:c)=O / 70\n",
+ "cN(-,:C)C / 71\n",
+ "CC(=O)OC / 69\n",
+ "CC#CC / 67\n",
+ "cI / 62\n",
+ "CNC(-,:C)=O / 56\n",
+ "cC=Cc / 54\n",
+ "c-n(-,:c)c / 37\n",
+ "cnn(-,:c)C / 46\n",
+ "cnnc / 32\n",
+ "cP(-,:c)c / 44\n",
+ "CS / 39\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('functional group SMARTS / count in the dataframe')\n",
+ "for cn in col_nms:\n",
+ "    df_count = fg_count_from_df[cn]\n",
+ "    print('{} / {}'.format(cn, df_count))\n",
+ "    # Nothing more to report when the dataframe total matches the IFG-derived count.\n",
+ "    if df_count == fg_count_from_mol[cn]:\n",
+ "        continue\n",
+ "    print('- - - - - {} has wrong count between dataframe and IFG extraction.'.format(cn))\n",
+ "    print(cn)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "50"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(col_nms)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "chem-dl-ir",
+ "language": "python",
+ "name": "chem-dl-ir"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/training_spectrum_to_fgs/INSTALL.md b/training_spectrum_to_fgs/INSTALL.md
index b040cdd..e433e88 100644
--- a/training_spectrum_to_fgs/INSTALL.md
+++ b/training_spectrum_to_fgs/INSTALL.md
@@ -45,22 +45,19 @@ $ pip install numpy pandas matplotlib scikit-learn scipy Flask git+https://githu
$ python -m ipykernel install --user --name=deep-ir-01
```
-
# 2. Training and Validation
1. To get nist data, you should buy it from `https://www.nist.gov/srd/nist-standard-reference-database-35`, and put & rename the folder to `./data/nist/`.

-
2. run following files one-by-one
-
```
+00_explanation_nist_and_in_house_dataset.ipynb
01_load_data.ipynb
02_train_model.ipynb
03_verify_model_and_save_acc.ipynb
04_count_spectra.ipynb
```
-