diff --git a/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb b/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb new file mode 100644 index 0000000..5355bef --- /dev/null +++ b/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb @@ -0,0 +1,1557 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 0. Load python libs" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import glob\n", + "import json\n", + "import nmrglue as ng\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pickle as pickle\n", + "import pandas as pd\n", + "import random\n", + "import math\n", + "from sklearn.model_selection import train_test_split\n", + "from collections import Counter, defaultdict, OrderedDict\n", + "\n", + "from scipy import interpolate\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from lib.carrier import SpectraCarrier\n", + "import lib.utils as utils" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020.09.1\n" + ] + } + ], + "source": [ + "import rdkit\n", + "print(rdkit.__version__)\n", + "\n", + "from rdkit import Chem\n", + "from rdkit import RDLogger\n", + "RDLogger.DisableLog('rdApp.*')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from lib.ifg import identify_functional_groups" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Load dataframes" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = './data'\n", + "SOURCE_DIR = './data/source'\n", + "TARGET_DIR = './data/target'" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "fn_train_df = '{}/train_df.pk'.format(SOURCE_DIR)\n", + "fn_valid_df = '{}/valid_df.pk'.format(SOURCE_DIR)\n", + "fn_test_df = '{}/test_df.pk'.format(SOURCE_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_train_df = None\n", + "with open(fn_train_df, 'rb') as file:\n", + " loaded_train_df = pickle.load(file)\n", + " \n", + "loaded_valid_df = None\n", + "with open(fn_valid_df, 'rb') as file:\n", + " loaded_valid_df = pickle.load(file)\n", + " \n", + "loaded_test_df = None\n", + "with open(fn_test_df, 'rb') as file:\n", + " loaded_test_df = pickle.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_rows', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fncano_smiCOcOCCOC(-,:C)=OcnccClcOCClCC(-,:C)=OcBrc[N&+](=O)[O&-]cC(-,:C)=OC=CCcNcC(=O)OCCOCCFCBrcoccFCC(=O)Oc=Oc[n&H1]ccsccC=OCNCCNCN(-,:C)CcC#Ncn(-,:c)CcC(=O)OCC=C(-,:C)CCC#NcNC(-,:C)=OcNCC/C=C/CCC=CCC=C(-,:C)CC#CCcC(-,:c)=OcN(-,:C)CCC(=O)OCCC#CCcICNC(-,:C)=OcC=Ccc-n(-,:c)ccnn(-,:c)CcnnccP(-,:c)cCSspectrum(W)-CO(W)-cOC(W)-COC(-,:C)=O(W)-cnc(W)-cCl(W)-cO(W)-CCl(W)-CC(-,:C)=O(W)-cBr(W)-c[N&+](=O)[O&-](W)-cC(-,:C)=O(W)-C=CC(W)-cN(W)-cC(=O)OC(W)-COC(W)-CF(W)-CBr(W)-coc(W)-cF(W)-CC(=O)O(W)-c=O(W)-c[n&H1]c(W)-csc(W)-cC=O(W)-CNC(W)-CN(W)-CN(-,:C)C(W)-cC#N(W)-cn(-,:c)C(W)-cC(=O)O(W)-CC=C(-,:C)C(W)-CC#N(W)-cNC(-,:C)=O(W)-cNC(W)-C/C=C/C(W)-CC=CC(W)-C=C(-,:C)C(W)-C#CC(W)-cC(-,:c)=O(W)-cN(-,:C)C(W)-CC(=O)OC(W)-CC#CC(W)-cI(W)-CNC(-,:C)=O(W)-cC=Cc(W)-c-n(-,:c)c(W)-cnn(-,:c)C(W)-cnnc(W)-cP(-,:c)c(W)-CS
01136-86-3COc1cc(C(C)=O)cc(OC)c1OC01000000001000000000000000000000000000000000000000None0.5678777.2023100.5407320.5329570.536040.5281670.5263220.5215370.5169490.51996115.0593750.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.5084410.5085480.5084410.5059850.5071560.507050.5068360.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
3610-54-8CCOc1ccc([N+](=O)[O-])cc1[N+](=O)[O-]01000000010000000000000000000000000000000000000000None0.5678777.2023100.5407320.5329570.536040.5281670.5263220.5215370.51694914.3267570.5189530.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.5084410.5085480.5084410.5059850.5071560.507050.5068360.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
5645-36-3CCOC(CN)OCC00000000000000000000000001000000000000000000000000None0.5678770.5413390.5407320.5329570.536040.5281670.5263220.5215370.5169490.5199610.5189530.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.50844132.7216050.5084410.5059850.5071560.507050.5068360.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
\n", + "
" + ], + "text/plain": [ + " fn cano_smi CO cOC COC(-,:C)=O \\\n", + "0 1136-86-3 COc1cc(C(C)=O)cc(OC)c1OC 0 1 0 \n", + "3 610-54-8 CCOc1ccc([N+](=O)[O-])cc1[N+](=O)[O-] 0 1 0 \n", + "5 645-36-3 CCOC(CN)OCC 0 0 0 \n", + "\n", + " cnc cCl cO CCl CC(-,:C)=O cBr c[N&+](=O)[O&-] cC(-,:C)=O C=CC cN \\\n", + "0 0 0 0 0 0 0 0 1 0 0 \n", + "3 0 0 0 0 0 0 1 0 0 0 \n", + "5 0 0 0 0 0 0 0 0 0 0 \n", + "\n", + " cC(=O)OC COC CF CBr coc cF CC(=O)O c=O c[n&H1]c csc cC=O CNC \\\n", + "0 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "5 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "\n", + " CN CN(-,:C)C cC#N cn(-,:c)C cC(=O)O CC=C(-,:C)C CC#N cNC(-,:C)=O \\\n", + "0 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 \n", + "5 1 0 0 0 0 0 0 0 \n", + "\n", + " cNC C/C=C/C CC=CC C=C(-,:C)C C#CC cC(-,:c)=O cN(-,:C)C CC(=O)OC \\\n", + "0 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 \n", + "5 0 0 0 0 0 0 0 0 \n", + "\n", + " CC#CC cI CNC(-,:C)=O cC=Cc c-n(-,:c)c cnn(-,:c)C cnnc cP(-,:c)c CS \\\n", + "0 0 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 0 \n", + "5 0 0 0 0 0 0 0 0 0 \n", + "\n", + " spectrum (W)-CO (W)-cOC (W)-COC(-,:C)=O (W)-cnc (W)-cCl (W)-cO \\\n", + "0 None 0.567877 7.202310 0.540732 0.532957 0.53604 0.528167 \n", + "3 None 0.567877 7.202310 0.540732 0.532957 0.53604 0.528167 \n", + "5 None 0.567877 0.541339 0.540732 0.532957 0.53604 0.528167 \n", + "\n", + " (W)-CCl (W)-CC(-,:C)=O (W)-cBr (W)-c[N&+](=O)[O&-] (W)-cC(-,:C)=O \\\n", + "0 0.526322 0.521537 0.516949 0.519961 15.059375 \n", + "3 0.526322 0.521537 0.516949 14.326757 0.518953 \n", + "5 0.526322 0.521537 0.516949 0.519961 0.518953 \n", + "\n", + " (W)-C=CC (W)-cN (W)-cC(=O)OC (W)-COC (W)-CF (W)-CBr (W)-coc \\\n", + "0 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "3 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "5 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "\n", + " (W)-cF (W)-CC(=O)O (W)-c=O (W)-c[n&H1]c (W)-csc (W)-cC=O \\\n", + "0 0.513315 0.511245 
0.510595 0.509516 0.508978 0.508226 \n", + "3 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n", + "5 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n", + "\n", + " (W)-CNC (W)-CN (W)-CN(-,:C)C (W)-cC#N (W)-cn(-,:c)C (W)-cC(=O)O \\\n", + "0 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n", + "3 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n", + "5 0.508441 32.721605 0.508441 0.505985 0.507156 0.50705 \n", + "\n", + " (W)-CC=C(-,:C)C (W)-CC#N (W)-cNC(-,:C)=O (W)-cNC (W)-C/C=C/C \\\n", + "0 0.506836 0.50673 0.505772 0.505879 0.505348 \n", + "3 0.506836 0.50673 0.505772 0.505879 0.505348 \n", + "5 0.506836 0.50673 0.505772 0.505879 0.505348 \n", + "\n", + " (W)-CC=CC (W)-C=C(-,:C)C (W)-C#CC (W)-cC(-,:c)=O (W)-cN(-,:C)C \\\n", + "0 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "3 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "5 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "\n", + " (W)-CC(=O)OC (W)-CC#CC (W)-cI (W)-CNC(-,:C)=O (W)-cC=Cc \\\n", + "0 0.505242 0.505136 0.504713 0.504185 0.504079 \n", + "3 0.505242 0.505136 0.504713 0.504185 0.504079 \n", + "5 0.505242 0.505136 0.504713 0.504185 0.504079 \n", + "\n", + " (W)-c-n(-,:c)c (W)-cnn(-,:c)C (W)-cnnc (W)-cP(-,:c)c (W)-CS \n", + "0 0.501979 0.503132 0.502398 0.502922 0.502398 \n", + "3 0.501979 0.503132 0.502398 0.502922 0.502398 \n", + "5 0.501979 0.503132 0.502398 0.502922 0.502398 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loaded_train_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fncano_smiCOcOCCOC(-,:C)=OcnccClcOCClCC(-,:C)=OcBrc[N&+](=O)[O&-]cC(-,:C)=OC=CCcNcC(=O)OCCOCCFCBrcoccFCC(=O)Oc=Oc[n&H1]ccsccC=OCNCCNCN(-,:C)CcC#Ncn(-,:c)CcC(=O)OCC=C(-,:C)CCC#NcNC(-,:C)=OcNCC/C=C/CCC=CCC=C(-,:C)CC#CCcC(-,:c)=OcN(-,:C)CCC(=O)OCCC#CCcICNC(-,:C)=OcC=Ccc-n(-,:c)ccnn(-,:c)CcnnccP(-,:c)cCSspectrum(W)-CO(W)-cOC(W)-COC(-,:C)=O(W)-cnc(W)-cCl(W)-cO(W)-CCl(W)-CC(-,:C)=O(W)-cBr(W)-c[N&+](=O)[O&-](W)-cC(-,:C)=O(W)-C=CC(W)-cN(W)-cC(=O)OC(W)-COC(W)-CF(W)-CBr(W)-coc(W)-cF(W)-CC(=O)O(W)-c=O(W)-c[n&H1]c(W)-csc(W)-cC=O(W)-CNC(W)-CN(W)-CN(-,:C)C(W)-cC#N(W)-cn(-,:c)C(W)-cC(=O)O(W)-CC=C(-,:C)C(W)-CC#N(W)-cNC(-,:C)=O(W)-cNC(W)-C/C=C/C(W)-CC=CC(W)-C=C(-,:C)C(W)-C#CC(W)-cC(-,:c)=O(W)-cN(-,:C)C(W)-CC(=O)OC(W)-CC#CC(W)-cI(W)-CNC(-,:C)=O(W)-cC=Cc(W)-c-n(-,:c)c(W)-cnn(-,:c)C(W)-cnnc(W)-cP(-,:c)c(W)-CS
6288a5e81bc8-7cd1-4603-bf77-d6635351f25bCC1=CC[C@@H](C2(C)CC2(C)C(=O)OC(C)(C)C)CC100100000000000000000000000000010000000000000000000[0.9202703349311558, 0.9234971350037704, 0.926...0.5678770.5413397.3015150.5329570.536040.5281670.5263220.5215370.5169490.5199610.5189530.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.5084410.5085480.5084410.5059850.5071560.5070540.7761540.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
6289427ee4cc-e725-4b40-829d-51b14984e029Brc1cnc2ccccc2n100010000100000000000000000000000000000000000000000[0.6751178997222926, 0.6719449948267988, 0.668...0.5678770.5413390.5407328.8941280.536040.5281670.5263220.52153716.7750000.5199610.5189530.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.5084410.5085480.5084410.5059850.5071560.507050.5068360.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
6290dcf80c78-98e2-4ef9-b8a8-bc259aa8f9a4CCCC[Sn](CCCC)(CCCC)c1cnc2ccccc2n100010000000000000000000000000000000000000000000000[0.5719682556458464, 0.5722615937762792, 0.571...0.5678770.5413390.5407328.8941280.536040.5281670.5263220.5215370.5169490.5199610.5189530.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.5084410.5085480.5084410.5059850.5071560.507050.5068360.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
\n", + "
" + ], + "text/plain": [ + " fn \\\n", + "6288 a5e81bc8-7cd1-4603-bf77-d6635351f25b \n", + "6289 427ee4cc-e725-4b40-829d-51b14984e029 \n", + "6290 dcf80c78-98e2-4ef9-b8a8-bc259aa8f9a4 \n", + "\n", + " cano_smi CO cOC COC(-,:C)=O cnc \\\n", + "6288 CC1=CC[C@@H](C2(C)CC2(C)C(=O)OC(C)(C)C)CC1 0 0 1 0 \n", + "6289 Brc1cnc2ccccc2n1 0 0 0 1 \n", + "6290 CCCC[Sn](CCCC)(CCCC)c1cnc2ccccc2n1 0 0 0 1 \n", + "\n", + " cCl cO CCl CC(-,:C)=O cBr c[N&+](=O)[O&-] cC(-,:C)=O C=CC cN \\\n", + "6288 0 0 0 0 0 0 0 0 0 \n", + "6289 0 0 0 0 1 0 0 0 0 \n", + "6290 0 0 0 0 0 0 0 0 0 \n", + "\n", + " cC(=O)OC COC CF CBr coc cF CC(=O)O c=O c[n&H1]c csc cC=O CNC \\\n", + "6288 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "6289 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "6290 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "\n", + " CN CN(-,:C)C cC#N cn(-,:c)C cC(=O)O CC=C(-,:C)C CC#N cNC(-,:C)=O \\\n", + "6288 0 0 0 0 0 1 0 0 \n", + "6289 0 0 0 0 0 0 0 0 \n", + "6290 0 0 0 0 0 0 0 0 \n", + "\n", + " cNC C/C=C/C CC=CC C=C(-,:C)C C#CC cC(-,:c)=O cN(-,:C)C CC(=O)OC \\\n", + "6288 0 0 0 0 0 0 0 0 \n", + "6289 0 0 0 0 0 0 0 0 \n", + "6290 0 0 0 0 0 0 0 0 \n", + "\n", + " CC#CC cI CNC(-,:C)=O cC=Cc c-n(-,:c)c cnn(-,:c)C cnnc cP(-,:c)c \\\n", + "6288 0 0 0 0 0 0 0 0 \n", + "6289 0 0 0 0 0 0 0 0 \n", + "6290 0 0 0 0 0 0 0 0 \n", + "\n", + " CS spectrum (W)-CO \\\n", + "6288 0 [0.9202703349311558, 0.9234971350037704, 0.926... 0.567877 \n", + "6289 0 [0.6751178997222926, 0.6719449948267988, 0.668... 0.567877 \n", + "6290 0 [0.5719682556458464, 0.5722615937762792, 0.571... 
0.567877 \n", + "\n", + " (W)-cOC (W)-COC(-,:C)=O (W)-cnc (W)-cCl (W)-cO (W)-CCl \\\n", + "6288 0.541339 7.301515 0.532957 0.53604 0.528167 0.526322 \n", + "6289 0.541339 0.540732 8.894128 0.53604 0.528167 0.526322 \n", + "6290 0.541339 0.540732 8.894128 0.53604 0.528167 0.526322 \n", + "\n", + " (W)-CC(-,:C)=O (W)-cBr (W)-c[N&+](=O)[O&-] (W)-cC(-,:C)=O \\\n", + "6288 0.521537 0.516949 0.519961 0.518953 \n", + "6289 0.521537 16.775000 0.519961 0.518953 \n", + "6290 0.521537 0.516949 0.519961 0.518953 \n", + "\n", + " (W)-C=CC (W)-cN (W)-cC(=O)OC (W)-COC (W)-CF (W)-CBr (W)-coc \\\n", + "6288 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "6289 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "6290 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "\n", + " (W)-cF (W)-CC(=O)O (W)-c=O (W)-c[n&H1]c (W)-csc (W)-cC=O \\\n", + "6288 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n", + "6289 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n", + "6290 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n", + "\n", + " (W)-CNC (W)-CN (W)-CN(-,:C)C (W)-cC#N (W)-cn(-,:c)C (W)-cC(=O)O \\\n", + "6288 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n", + "6289 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n", + "6290 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n", + "\n", + " (W)-CC=C(-,:C)C (W)-CC#N (W)-cNC(-,:C)=O (W)-cNC (W)-C/C=C/C \\\n", + "6288 40.776154 0.50673 0.505772 0.505879 0.505348 \n", + "6289 0.506836 0.50673 0.505772 0.505879 0.505348 \n", + "6290 0.506836 0.50673 0.505772 0.505879 0.505348 \n", + "\n", + " (W)-CC=CC (W)-C=C(-,:C)C (W)-C#CC (W)-cC(-,:c)=O (W)-cN(-,:C)C \\\n", + "6288 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "6289 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "6290 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "\n", + " (W)-CC(=O)OC (W)-CC#CC (W)-cI (W)-CNC(-,:C)=O (W)-cC=Cc \\\n", + "6288 0.505242 0.505136 0.504713 0.504185 
0.504079 \n", + "6289 0.505242 0.505136 0.504713 0.504185 0.504079 \n", + "6290 0.505242 0.505136 0.504713 0.504185 0.504079 \n", + "\n", + " (W)-c-n(-,:c)c (W)-cnn(-,:c)C (W)-cnnc (W)-cP(-,:c)c (W)-CS \n", + "6288 0.501979 0.503132 0.502398 0.502922 0.502398 \n", + "6289 0.501979 0.503132 0.502398 0.502922 0.502398 \n", + "6290 0.501979 0.503132 0.502398 0.502922 0.502398 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loaded_train_df.tail(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXxkZZno8d9TlX1PutNbet9paKCb0OyyDnSDIzKDjjjCDOogCuq9VwfxOpt3NscZvTNeUS7ei6h3hHFwoRWQTUGFhibQ+77Qna07Saez71X13j/OklOVSlKVVJJTxfP9fPrTVadOVd6cJE899bybGGNQSimVuQIz3QCllFJTSwO9UkplOA30SimV4TTQK6VUhtNAr5RSGS5rpr7w7NmzzdKlS2fqyyulVFp66623zhhjKpN5zowF+qVLl1JTUzNTX14ppdKSiJxM9jlaulFKqQyngV4ppTKcBnqllMpwGuiVUirDaaBXSqkMN26gF5FHRaRZRPaO8riIyDdE5KiI7BaRjalvplJKqYlKJKN/DNg8xuNbgFX2v3uAb0++WUoppVJl3EBvjPkNcHaMU24Fvm8srwNlIjI/VQ1USim/MMbwo5o62nsHZ7opSUlFjb4KqPPcr7ePjSAi94hIjYjUtLS0pOBLK6XU9NnX2MkDT+7ma88fnummJCUVgV7iHIu7m4kx5hFjTLUxprqyMqkZvEopNeMa2vsA2NPQMcMtSU4qAn09sMhzfyHQmILXVUopXznd0Q9AblZ6DVhMRWu3AnfZo28uBTqMMadS8LpKKeUrTZ1WoM/PCc5wS5Iz7qJmIvI4cA0wW0Tqgb8GsgGMMQ8DzwA3A0eBXuDuqWqsUkrNpL6hMABD4cgMtyQ54wZ6Y8wd4zxugPtS1iKllPKpgZAV4AdD6RXo06vQpJRSM8gJ8AMa6JVSKjM5AX5gSAO9UkplpMGQVaMfzLQavVJKvdt96ad7WF9V6mb0oYgGeqWUyhjdAyH+/Y1aAC5fMQuANIvzWrpRSqmxvNPS4952MvqIiTv537c00Cul1BiOtXS7twc10CulVOZxAn1hTpABuzM2kl5xXgO9UkqNxQn0oYgZLt2kWaTXQK+UUmM41mzV6AdCEXf8vJZulFIqQ4TCEd5pHe6M7R0MARDWjF4ppTLD4aZuBkMR1s4rBqBn0KrRp1lCr4FeKaVGs6OuDYCLl1Y
Aw5l8OM0ivQZ6pZQaxY7admYV5rC8sjDquNbolVIZ7WRrD3c88jrN9iYcmezt2jY2LC4jKxgdKnVmrFIqo/1g20m2HW9l667M3jH0TPcAx1t6qF5aQXZgeGvs/OygZvRKqcx2pntgppswLWpOnAXg4qXlBL2BPkcDvVIqwzV3WYG+eyA0wy0ZXVf/EMu/+DTP7pn49tXbjrWSnx1kfVUZ2Z7SjZXRg0mjYK+BXimVMGMMh5usmaK99lBDP2po7yNi4GsvHJ7Q840xvHK4hUuWV5CTFSArOJzR52UH7HNS0tRpoYFeKZWwurN9bunGmTzkR87iY56KS1KOtfRworWX69fOASAr4Mnoc4JAeg2x1ECvlErY27Vt7u3eAf9m9D122wIysUj/0oEmAK4/Zy4AWTGdsZBeQyx14xGlVMLeOtlGUW4W80rzfF266bH7D/LsoJys14+3smpOEQvK8gFiSjd2oE+jIZaa0SulEvbWyTYuXFRGUW4WPT4u3Thty59AoDfGsKehg/MXlrnHYjtjIb0yeg30SqmE9AyEOHi6k42LyyjMDdLn64zeaptTT0/G6c5+znQPcv7CUvdYVszwStAavVIqA+2qaydiYOOScvKzs9wFvvzI6Sh2RsgkY099BwDnVXkCfZyM3mjpRimVaZyO2A2Ly+2M3r+lG2eMf15W8hn93oYOAgLr5pe4x3I8gT5PSzdKqUy17Xgra+YWU5qfTUGOfzP6/qEwbT2D1p0JDLo51tLDklmFUWUfb2eslm6UUhmpZyDEm++0cfWaSsDaP7Wla8Atc/jJ5V/5Fd/bdhKY2JZ/jR19LCjLizrm7Yx1PiVoRq+UyihP7WxkMBzhpnOtceWFudbI7N//5u8m9bqN7X189okddPUPTbqNYAX2s042T2KbeA+EwkQihqPN3W6bFpTmR53jLd3k5wTsr5WCBk8THUevlBrXL3Y3sqKykI2Ly4GJj0+P9fArx3hqZyNXrJjNBy9eNOnXO9MTveDaeOUVYwy3f3sbexqsTyY/u+8KmrsGmF8WHeijSjdao1dKZZrW7gHeeOcsW86bj9gzTfuHUlOfb2zvA2CCE1hHaOqIDvTjlW5ePtziBnmA3x5uwRiYXzp66SbXLt2k076xCQV6EdksIodE5KiIPBjn8VIR+bmI7BKRfSJyd+qbqpSaCd999QThiGHL+nnuscLc1GT0De3W5iUDocnVQYwx/PpQM3VtvVHHx8q6+wbDfO35Q1HHTrRazx8Z6IffiQL2mPo0SujHD/QiEgQeArYA64A7RGRdzGn3AfuNMRcA1wBfE5GcFLdVKTXNjDE8vr2W69bO4dwFw+PK/6h6MQDFeZOr/p7qsDL6yX5C+MXuU9z93Tf5x2cPRB0Pj/H+8YPXT7C3oZMrV852j/347XoAFlcURJ3rzeiduVOG9In0iWT0m4CjxpjjxphB4Ang1phzDFAs1ue6IuAs4N9BtkqphBw/00Nrz6DbCesoLcjmjy9ZTG7WxKu/g6EI7b1WJ+xkM3pnt6u6s31Rx8fK6F8+1MLaecXcsWlx1PEVlYUsmx29R6w30DtlpjSq3CQU6KuAOs/9evuY1zeBc4BGYA/wWWNGzhsTkXtEpEZEalpaWibY5OkXiRjae4d78o82d/Nn36/hl3tPA/Dq0TNsO9aaVhsRqHePjr4hmjr72XasNannGWPY/o6zy1LFiMezgwF3OeCJqD07XGaZTEbf2j3Arw82u/ffe/58PnnNCkrzs0cN9G09g9ScaOOqVbMpL8iOeuyGdXPdvghHVOlGnNJN+vy9J/K5K143Sex3eBOwE7gOWAG8ICK/NcZ0Rj3JmEeARwCqq6t9eZX6h8JkBYQn36qnpWuACxeX8b3XTvDigWZe/G9X8/6HXnVn3b2wvynquSsqC3n6M1elbESCUhPRNxgmOyhkBQNs3dXIf3lih5t9/ugTl7Fp2XDQjkSMW3P2+uXe0zzw5C6WzCpkdlHuiAwXICcrwFB
44n/G75zpcW9PJtBv3dVIyJNef+I9K1i/sJRtx1pH7TD9/raTDEUifLB60YjvoShnZFiMDfyQXhl9IoG+HvCOe1qIlbl73Q18xVhvcUdF5B1gLbA9Ja2cAsYYRIRIxFDX1svOuna++stDNLT3jfqcG77+ypiveaylh//9ynE+e8OqVDdX+djB0538yaPb+dYfb+SiJSMzX8fehg5+ufc0d162hLkleaOel4hwxPC3v9jPjto2bttQxdZdjTR3DdAzEKKtd4jsoLCgLJ+TdufihYvK2FnXzgNP7mLrp6+kJC+b7oEQ13/tZe66bCn3Xbsy6vW/+txBOvtD7Gno4Ob18+IGuuygMDRWEXwcJ+xAHxDom0Sg/8+aes6rKuG95y/gK88e5Jz5xQAEAzJqRv/SwSYuWlzOqrnFnO7oj3qsIHfssDi8xn36RPpEAv2bwCoRWQY0AB8CPhxzTi1wPfBbEZkLrAGOp7Kh4znTPcDpjn7WzivmH545yOq5RZxXVcrtD79G/5D1y/iz+66grWeQH9XUsbu+gzs2LeJfnp/YVmPfuauadQtKiEQMc0py+d5rJ3h272n+54uHuXhZOZevmD3+i6i019U/xCuHWmjqHODPn9zN+VWlzC3J44HNaxFgMBxh/6lO7vv3tzllB5Rv/vooJ75yC2AFu8FwhNVzi8f8OgOhMLlZQerO9vLV5w7x813DudYue3bqoop82nqHWD67kLkleeyoa+PipeU8cmc15YU5/OpgEx99rIZf7DrFhy9ZzN/9Yj9NnQP883OHuHR5hfsmdbZnMCrbjle2Aat0E4oYN2lKxMOvHGP13CKuWzuX42d6qCjMITcr4P6NJuJ4Szf/61dH+cc/WE/d2V72n+rkf9x6LnddtpR7r17hnhcUiZvRt3YPsLu+g8/fuBqAspjSTeE4K16mY41+3EBvjAmJyP3Ac0AQeNQYs09E7rUffxj4W+AxEdmDVer5gjHmzFQ0OBSO8OaJNh7fXut2wMwqzKHVMxtuNO9/6NWo+94g/9ErlvHoq++4969aNZvzqkq579qVhCOGp3Y28NVfHuJfPnABC8vzo1a2A7jnPSu4fMVs3vu/fse3Xz6mgT6DvX68lYdfOcYN58zlL3621z1+vKWH4y1WgPzfvxk7z1n64NP824cu5LNP7ATgnX+8mWMt3ZQV5NDcOUBhbpAfvlHLjrp2tr9zlmBA+OKWtfzDMwfcAHPJsgr+38cv4YX9TcwvzWPD4nL6h8Kjlg6vXTOHxRUF/KimjoDAE2/WsbA8n/q2PrbubHQD/cuHmqOGDlaP8inF6aAcChtyssYP9LWtvXzl2YMAnPjKLdSd7WVRRQHtvYOEkvhkcMs3fkffUJiPXLqYd85Yn1ji/b2JxA/GB093AURN/vru3Rdz93ffBEbP6L9xxwYKsoMM2m1NoxJ9YjNjjTHPAM/EHHvYc7sRuDG1TYvvJzsaeODJ3QQ9dcXWnkHKCrLdHvwrV87md0et95nPXL+Kb7x0ZMTrbFxcxr7GTr50yznceekSRIT7r1vJD7ad5FPXrojqZQe467Kl7nmjOa+qlM9ev4p/e+kILV0DVBbnpuJbVlPguX2nOWdeCYtnFYx6Tjhi2NfYQTAg7Kxr58ObFrOrvoMPPfI6YI3aiGfD4jJ21LYDsKA0j0ZPaeCT16zg2y8fA3CDPMCyL0b9eZGTFd3RGY4Y/u7pA5QVZPOdu6pZUlFAaUE22cEAN6+f7543Vv+QiHDnpUv4+2cO0NxptWnr/Vfyse+9ydGWbve87e+cjfp7ckohsZwOyqFwhJwERt88v/+0e7tvMEx9Wy/nVZXSMxAas9bf0N7HkaYurlkzh8b2PrfM09I1wP7GTvKyA3H7EAIihOKsU3C4yQr0K+cWuceuXTPHvV0wyjV83wULAHh2zykgvWbGpt0SCNeumcNDH97IdWvnuGtNO8G3Z8CqKV6yrILfHT3DZctnkRUM8OnrVtLY3kdFYQ5NnQO
snGP9gMMRE/WGUVGYM2Z9PZGPpzeeO5d/e+kIj732Dn9+09rJfKsqAcYY3q5tY3ZRLosrCqJ+RkeaujjW0s2CsnxCEcPehg66+kPUnDjLr+0gveW8ecwvzaesIJvbL1rIl366h0Onu8jPCXKspSfqaz35Vr0bwB03nTuX5/Y1sWZuMZ+7cTXLZhfS2NHPvT94i//4xKXuLkX9Q1ZgW1FZxK0XLmBgKMJt33p19I//Bu6/diV1bb08sHktdWd7eeQ3x/nUNSuoHqWUkohLl88CoLGjn/uvXUlFYQ4LyvLZ3zg8buLg6S7WzivmTy9fRkN7X9Ra7F7DGX1i2bh38MKhpi4a2/vZfN58jjZ3u1lyPB98eBsN7X289Rc38OaJs+7x5/c18ZMdDZy/sDTq79gRDAjxFtg80txNWUE2lUXxE7GCcSaDiTvqZszTfCXtAn1lcS63nD8/7mOFuVnuL/JVqyrd49nBAEtmWe/4xXnD9bh4vxyTde6CUq5eXcmPaur5rzesHvWPRCUvHDF857fHOb+qlJbuAXbUtvPYayfcx7ODwrzSPApzsjDGygSdEVKxnIz12b3DWebXXxi7v2ZHbTvFeVnceekSHtg8/Cb+1smzLCwvcDtYV80tZt+Xb4oazZKXHWTlHCszXjvPWuf8yN/fzN6GDm61S4qP/mk1pfk5rLfLgt4suaos3/3dnoy1nuz8hnXW2Pj5JXm8dKCJUDhCQITDTV18sHoRm8+bN9rLAMOBfqwg7WjvHaTmZBu3rJ/P03tO8erRMwyGI1SV55OTFRi1dHOqo88dILG7oSMq0P9kRwPAiAXIHFbpZmQ0PtXex8Ly/FETt4I4o25iXxc0o3/X+8ilS/iz79fw7N7T/L79cc8vTnf0U5gbjHrD87PG9j4+8/gOsoMBth2PPw78kmUVLK8s4oX9p6MmzGxaVsF7Vs3mwKku5pbksWx2ATedO4+wMcwvzec/a+p4bl8TZQXZnDjTQ83JNu66bAl3X7GMeSV5hCIR9zq9dfIszZ0DbFk/MsmIN9Im3pDFWMGAcMGiMq5bO4f23kGuWzt33OdMVnYwwG0bqjja3M359hvKrKJc+ocirPzSs3z4ksX0DoZZMadonFcaXtExkSGWLx9qIRwxfPTKpbx4oImXDljZ/cLyfLICMuprPLF9eArP3d99k42Ly1g5p8hdaRLgS7ecE/e5AXtUXay23iHKC0afuF80TkYfSNXCPNNIA/0UuH7tHOaW5PLzXY2+CfR7Gzr4p18e5LdHzrBkVgEv/rerR/RDpJIxBmMSC3hd/UP8qKaekrwsOvtDtPUMcqipi4a2PurbeunsD5GbFaA4N4vV84opycvi9osW0dU/xPs3VLl16X/8g/UYY+gfiiS0V+gHqhfxgWpr5HA4Yk0OunhpuedT2PBrjDVscrK+c1f1tGaHX//gBUQ8P5tZRcNB74dv1AJWlj+ebLsDdiiBSVNbdzUyrySPDYvKmVuSx/5TVqloTnGuNfEqTkbfPRDie9tOcP3aObxkT4h6u7adq1dXuoH+z29aw6KK+P0s1vDKkcfbegdHLHHgVTjO8ErnN1oz+ne5QEDYct58fri9lu6BEEXj/OJMpRf3N/G1Fw5zwP7DKivI5mRrL68da+Xnuxr5/I1rmFc68THdbT2D/HRHA6vnFnP5ilk88WYdP9vZwFsn2zDGsHJOETevn08kYthV38GRpi7ycoJUleVztmeQgVAkKjuLtd4e+TReGcEhIhPaEDoYEC5bMfnSyEQEA0JwIlshTZCI4JnoyeyikdntnJLxBxJkBaw3xHgdnl6t3QO8criFj1+1jEBAKCvIdmfFluRlk5MVoCdOie2J7bW09w7xmetX8f4NVXz68R1W2zyDHErzR/9kGvCUbo63dPP3Tx+gOC+Lk629XLO6ctTnjVe6sb9trdEraxr2Y6+d4KUDTdx6YeyKEVNjKBwhOxigtXuAf3vpCDtq290lWP+oehGfv2kNDe1
9vP+hV/n684fYVd9BdjDA525czWOvnuBT164Y95f8bM8g//riYYpys9hd38Hexg53dIZj9dwiPli9iJK8LLbuauRfXzyCiFVLvXBxGUebuznR2sPSWYV09g3xBxuq+ED1IiqLc8gJBokYw9LZhaPO2lSpNatwZFBPZEKXW6MPjR3xnt5zinDEcNsG6+/AG5xL8rNHLd381O5ovWBRGSWe58zydKKWjBHoxTOO/j/erHM/FQCUxSnd3LFpEY9vrxt/HL39pjxeRn+0uZslswqm9JNzojTQT5ENi8spyAmyo7Z9ygK9MYbewTDbjrXys50NPL+viUF7qNtgKMKC0jw+fuUyPnnNCvePw1lxz5lks/9UJ99/7QTf/PVRFlXk80cXRy/wVN/WS21rL5etmMXR5m7uenS7O/EHoCg3i2/98UaaOvt5evcpqpdW8IXNa9yOrgc2r+WN462snlfM7FFGOYxGg/z0mBWT0YtYc1PG44ydH2/Uzc92NLBmbrHbCV3oSSaKcrPIDgZGvMapjj5r+PPNVv29xLNKprdtY62eGRRxs+7YJRYq4nx/X7z5HD7xnhXjDqBwSvRjhfkTZ3q44euvcO/VK3hwy8yPvtNAP0WCAWHtvGK3ZJJqbT2DfPrxHe58Aa+cYICPXrGMz924ekQ2MTsmezt8uosVldaIpMb24QBujGF3fQd3Pbqdjr4hygqszMsYePLeyzje0sON586lKDfL/cO4+4plI9oSDAiXr9TJY34Wm9EXe36mY0lkeGVtay9v17bzwOY17jFn+GJxbhbBgJCdNTLQ76qzhrFWL7UmNXkHD3iDdMkYgwoCgeGsu6kzekOS8jiBviQve8zXc0gCi5q9XdsGwLN7T2mgz3TrFpTw1M7GpKaIJ+K3R1q48/9aywitqCzkz65azm0bq8gJBsb9Ot4s+cZ1c3l+fxNvHLeGrDmbNhhj+MKPd/OjGmtt7jVziznU1EX1knL+8r3ruGBR2aTGcit/ie3TGK8z0pHI8Mqf77Zmr7/PMyjByeidbDw7TulmZ10H2UHhnPnWpwDvUNOKIm+gH72tIsNr3dS3R29Ikj+JhQfd9ejHSOmdkul4pdDp4o9WZKhz5pfw/16vpb6tb9SRAYlo6xnEYGUyP3j9JH/1lDXt3lnfI1nFeVl09Yf4YPUint/f5I5TrreHJn7llwfdIP8f91zKJctn0dDex7ySvCmZe6D8pSDBzuzhmbGjR7xXDrWwvqqUheXDv//O6ztZerzSze76ds6ZXxJ3pq+3dDNWjT4g1qiburO97G/sZNOyCjYtraClayBqs5FkDdfoRz/HWbCtb9Af23JooJ9C6+xsZF9j54QC/e76dt73TWsyTXFuFhcuLuO3R85w+YpZfPX286P+eJLx009dztmeIRaUDXe4bVhcRl1bL0eauvjOb45z+0UL+eofnu9+Aqgqiz8pRWWeREeJuaWbUYZX9gyEeLu2jY9ftTzquJPlOiWc7JjljgdDEXbWtfOHGxfGfd2KhGv0VunmSHMXEQNf2LwmJcNkhzP60SO9M6qoN97U3Bkw893BGWztvBICAvsbO8Y/OcbTu09x27deA6wgn5cTZNuxVj593Uoe/dOLJxzkAVbOKWbTsgqqyvKt0Tg3ruaqVZWc6ujnvh++TWFuFv/95nO0M/Rd5OGPbHRvJ1pucAK9M7zy4OnOqMx8+4mzhCJmRPbs7DebZf9+WaWb4efd98O36R0Mc82a+EMgvX0KY5VgAnbpprPPyqrHmiSVlHFWr7SWPrc+HYfinDTehuVTQTP6KZSfE2TVnOKoXebHc7ZnkM//5y5+dbCZtfOK+Z9/dCEr5xRhjDWBJN5ogYkSEf7p9vMB+M8aawbi4aZu/vbWc1P6dZT/bT5vPrdeuICndjYmvPG3E+gHQhH2N3Zy8zd+ywOb1/Cpa6y17d84fpbsoLgdqg7njcQpA3pLNz0DIX51sJkt583jurVziCc/J8gPP34JB093jdk
nZe03AZ391vDfscbcJ8PdYWqUcTct3QMMhiJx1+uPRAzve+h33LZhIR+7cuTghamiGf0UW7+wlN31HQlvO/Yvzx/iVwebWVCax3fuquac+SVkBwPkZAWmNPjOscdNX7umkjsnUPdX6c9Z0iDRjN45PxQ2NHVZI7a82xXurGuLW2d33kjcQJ8VIGSXbg6e7iIcMfzhxoUjgvgVK4cntF2+cjYfHSdQBu1RN519VqBP1bIf7rYjo/xJO2Wb5bOL3O/L8auDzext6Iw7SW0qaUY/xS5YVMaTb9WP2yHb2T/EX/x0L1t3NXLnpUv48vvOndbSyeUrZvG531s9oc5dlRmckS0Jj7qxx9EPhiPukspOBhuOGPbUd/CHF42sszvllqA9xTQ7IAyGIxhjaOmyhkHGm639g49eQjiJ6ahO6aajb4j87GBCSykn9LqBsVevrLV39Vo2u5DjZ6JnfT/ym+NUleVHLS09HTSjn2IbFlnL1O6sax/zvCdr6tm6q5GsgLUu/nTXx7ODAT59/SpKC9JjsTOVem6gT3jUzfA4+q7+UNSxYy3d9AyGucBeptnLKVHnBCXqOY+9dsLdjDzeXg6BgCQ1y9SaGQudfSFK8lOX0wbcGn38SF97thcRWDq7kKGwcT/NH2/pZvuJs9x12ZJpny2rGf0UWzOvmNysADvr2kdd4GwoHOFHNXWsnVfMjz95ecIZlVKp5JZuEvz9c94YBkMRwhGrPJKbZb1JOOsXrZk3ctMSZ1kCZ6Z0tv06X/75fvecVJQpgwFrZExn/1BCE6ESN/YSCHVne5lXkucOIw1HDFlB4Xl7Pf73b5ieJVG8NKOfYtnBABcsLOOtk21xH49EDJ95fAcHT3fxiauXa5BXM8YJwIlm9N5lip1hhM7Y+uP2jlXLK0fu/HTjuXP51DUr+KK9vEFWzKfXcnvnrMlyR930D6WsI9Z6Xev/0YpIR5q7WVFZRJZ9LZyRNztq21g6q2DSG8NPhAb6aXDxsnL2NHTQ2N434rFf7DnFs3tP8+CWtdy2If64YaWmgzPDdaztCL28pRunRu9kucdaelhQmhe3Yzc7GOCBzWvd4BtbO5+V5JpIo3EmTFmlm9QF+rGWQDDGcLS5m1Vzi8gODF8fYww7atu5cNHIUtZ00EA/DT5YvYigCB//Xg2RiKF7IMSpjj46+oZ46FdHWV5ZyD0xk0qUmm59dlbubNE5nmBACIgVyJxOWGfi0/GWbpZXjr95CTByPaYUjUgRsT4xW6Wb1Nfo41VuWroH6BsKs3RW4XBGHzac6uinuWuADYvLRz5pGmidYBosmVXIX773HP7yqX1s+oeXONMdvcDSX9yik5PUzHPiVjI7KDmbhjhBbzBkZa/HW3q4bWNitejY0k2qMvqgZ9RNSjN6zxII4Yjhrkff4E8uW8qN586jzh5aubiiwF07Kmxn82DNQJ8JGuinyYcvWUJdWx/7GzuZV1qJAFXl+QjCRy5dMtPNU4r7r11Ja/cANyW4yQtYdfonttdxsT0pajAcoWsgRNdAiIXliS2bEVu6mZ2i+SKBgBA2hr6+1Nbo3WWKjaGrf4hXj7ZypKmbG8+d546hX1RR4K4hFYkYdtS2kZsVcJdqnm4a6KdJMCD895vj722plB8snV3Id+/elNRzgkGhvXeIFw9Ym3pEIoYme7+CRDsdnZ2qHBVxNkKZCBHoH7JKSqkcdTO8OfhwqarDnpRV22oF94Xl+e6EsLAx7KhrZ31VacrG8idLa/RKqZQJG+Ou/Z5woA9Gl27yc1ITloKeElRqx9G7c2PdkUrOoVp7aGVedtD9+j0DYfY0dMxYRyxooFdKTULsAl3hiKGp08ro5yUY6LNjAv3quSPH3k+Et68hZQuaEZvRW58YnLp9XVWths4AABKeSURBVFsviyqskpWT0W87dobBUIT3jLFP7VTTQK+UmrDYkSehsKG1x8roY7coHI23dPPXv7+Oq1MUEL19vPFm2k78dYeXQIjN6E9
39DO/NDrQH2rqAuDcBTNTnwcN9EqpSYidHRoxhu7+ECLRe8OOxVu6ueGcuSnbjc07ki3eUgwT5bxqxBh3iWawOmdPd/Yz316nx/n6dWf7KMgJzuiKsBrolVITFru0eihi6OwPUZSblfCQYW9Gn5vgGP5EOJn3Levnp3T4sjthiuFZrwK09Q4xGIq4C7I5NfrugRCFuVkp3U40WRrolVITFruaZCRi6OoPJTXKxZvRJzorNxFObE/19pfe4ZXOMsQiwumY0UbOPLDu/tCk9qhNBQ30SqkJi10GIBQxdA8MJbwdIeAuFQCT27Q7lpNBpzrQe2v03oy+vXcQGF6QzVmGuXsgTQK9iGwWkUMiclREHhzlnGtEZKeI7BORV1LbTKWUH8V2xobtjH6svVxjeTP6VC7f6wT4lGf09v8RYwg5O0gJtNtj6cvspb7djH4gRH6CC8VNlXF/GiISBB4Cfg+oB94Uka3GmP2ec8qAbwGbjTG1IhJ/DzClVEaJ7Yx1An0y69XEDq9MFSe+xy6xMPnXHS2jtwK9M5Qz4I6jT4+MfhNw1Bhz3BgzCDwB3BpzzoeBnxhjagGMMc2pbaZSyo/idcZ29Q8ltW1f7MzYVHECbarXkRoeR2+itgps77NKN85yC84niVDEzHhGn8gVrgLqPPfr7WNeq4FyEXlZRN4SkbvivZCI3CMiNSJS09LSMrEWK6V8K2Ks1VmLJli6SSUn0Kc6o3c7Y8EdXikidPQOkZcdcDuUvTNzE10RdKok8tOId5ViF+jMAi4CrgfygW0i8rox5nDUk4x5BHgEoLq6OvHNH5VSaSEUjjAQilCcRGfsVGX0ToBJ/aib4fXonQlTAbFKN2X5wyUr79fNy/J5jR4rg1/kub8QaIxzzhljTA/QIyK/AS4ADqOUetcIRwwDoUhSwySnKqMP29l2MMXj173r0TuLmgVEaO8bdDtiITrQp3J+wEQk8tXfBFaJyDIRyQE+BGyNOecp4CoRyRKRAuAS4EBqm6qU8ru+IWvzkmQCW/YUZfTOgJhgit9IYtejB6sfoL03ejlkb99Art8zemNMSETuB54DgsCjxph9InKv/fjDxpgDIvJLYDcQAf6PMWbvVDZcKeU/TudsThLDJKc6o0/9qBvrf8PwEghO6Wbp7AL3PO8niYFQOKVtSFZChTRjzDPAMzHHHo65/8/AP6euaUopv7toSXncje9zfVC6cYY+BlP9icGzeqUz6iZol25K80vd07ylm87+UGrbkCSdGauUmrDH7r6Ya9aMXG0yN4kNNqaudDMchFMpMLwGQtSom97BMIWeTmjvMslDoQgzSQO9UmrCivOyWRNn/fhkAv1U7ZfsBPpUf2IYnhk7/KkhEID+oXBUJ7T367bZyyPMFN1KUCk1KU6JIsfeKByS73z85DUruG5taifUuxn9lM2MNTild2cEjncGrDej/9iVy1PahmRpoFdKTYozXj032xPokxxO+IXNa1PcKk+NPsWlG+8OU86iboN2acYb6L1vMJuT2HB9KmigV0pNiokz0iaZ0s1UmaqM3pkwFTHGXetnwA703hmwqX6DmYyZ/2kopdKaYWRAnelx4zB1NXrv+4YzYcoZPumt0U9RH/OE+KgpSqm0ZGf0WVGBfuZDi9tRmvLSzXBG77yZDLoZvacz1keR3j8tUUqlJSeg5niC+0wv4gXWblcwhROmPKNunIli+ZrRK6UykbP5hjeb9UPp5n0XLgBg07KKlL6udwkEd+MRm/ca+KlGr52xSqlJGbTr1LlRgX7mc8grVs7mxFduSfnrDi9TPFy6cXhHG6W6E3gyZv6noZRKa86aMnlZ3lE3M5/RTxWJU7pxeMtEUzURbCI00CulJmX9wjIAVswpco/N9LK8Uyl6wlR06cbbAZvqvoHJ0NKNUmpSPnLJYqqXlHOkuZsfvlELJLd6ZbqJtwSCw1uuSfVon8nI3J+GUmpaiAjnzC/xbdki1bybg8fW6L1j9rVGr5TKOH7
KYKeSd3PwofDoGb2fRt1ooFdKpYSfMtip5O4Zy3BHtMOvn2q0Rq+USgk/dT5ONRG7M3aMGj3A7RctZMsML2gGGuiVUinipwx2qgVErOGV4djhldFFkn/5wAXT2axRaelGKZUSfqpJTzXBqtGPl9H7hQZ6pVRK+DXITYWAyLg1ej/RQK+USol3U6BHRsnop2ij88nSQK+USol3U6APCBC3Ru/Pa6CBXimVEu+mQC9I1Hr0Dr9eAw30SqmUeDd1xgbE3hB8RI3enyHVn61SSqUdJ5v1a/kilUSESJwlEPz6rWugV0qlhBvofdohmUoi1nr03hp9VkDcWbN+o4FeKZUSzoKV74YSjuCsRz9cuvFrfR400CulUiRo16f9HPBSJRCQEUsg+Pn71kCvlEoJJ5PPyuC16B3WzNjoGr2fP8lk/k9EKTUtSguyAaheUj7DLZl61sxYw2BouHTj4zivi5oppVKjND+bn37qclbNLZ7ppkw5ESuj7x8Ku8f8vKhbQhm9iGwWkUMiclREHhzjvItFJCwit6euiUqpdLFhcTlFuZmfP4pYNfoBT0bv541Xxg30IhIEHgK2AOuAO0Rk3Sjn/RPwXKobqZRSfhIQiESsjN6ZN+DjhD6hjH4TcNQYc9wYMwg8Adwa57xPAz8GmlPYPqWU8h1BGApHiBjIzwm6R/0qkUBfBdR57tfbx1wiUgXcBjw81guJyD0iUiMiNS0tLcm2VSmlfCEg0GfX5wvsQJ/uGX285puY+/8KfMEYE45z7vCTjHnEGFNtjKmurKxMtI1KKeUrIuIJ9FafhJ9r9In0mtQDizz3FwKNMedUA0/Y039nAzeLSMgY87OUtFIppXxEBPoGrUCfl+3/jD6RQP8msEpElgENwIeAD3tPMMYsc26LyGPALzTIK6Uylcjw0Mr87IB9zL+RftxAb4wJicj9WKNpgsCjxph9InKv/fiYdXmllMo0gTilGx/H+cQmTBljngGeiTkWN8AbY/508s1SSin/EqB/yBpDn+92xvo30usSCEoplaSAWMMrAXLstX38XKPXQK+UUskSGLLXonfW39eMXimlMkhAxF2L3t0+0L9xXgO9UkolS4CwndHnZGlGr5RSGScg4m4M7mT0WqNXSqkMIjK86YhTo3dLOD7k35YppZRPiYjbGZuT5f8tFDXQK6VUkrwhvdhef9+7f6zfaKBXSqkkeas0JfnWFoqDoTHXdJxRGuiVUipJ4snpS/KsQO+UcvxIA71SSiXJW44vybdKN85MWT/SQK+UUsnyjJnPz7YCvZPZ+1Hm7+KrlFIp5s3oF5bn87nfW837N1SN/oQZpoFeKaWS5B11EwwIn75+1Yy1JRFaulFKqSR5lzvw8/h5hwZ6pZRKkndZGz+vcePQQK+UUkkSzeiVUiqzRdXoNaNXSqnM4y3X+HgtM1caNFEppfzFG9y1dKOUUhnIuwSCdsYqpVQG8sZ2zeiVUioDRY260YxeKaUyjzeJD2hGr5RSmccJ7elQtgEN9EoplTSnAzYdyjaggV4ppZLmxPd0GEMPGuiVUippohm9UkplNie8p0NHLGigV0qppLk1+kwK9CKyWUQOichREXkwzuN/LCK77X+vicgFqW+qUkr5g1OxyZjSjYgEgYeALcA64A4RWRdz2jvA1caY84G/BR5JdUOVUsovnIw+k0o3m4CjxpjjxphB4AngVu8JxpjXjDFt9t3XgYWpbaZSSvlIpmX0QBVQ57lfbx8bzceAZ+M9ICL3iEiNiNS0tLQk3kqllPKRTKzRx/tOTNwTRa7FCvRfiPe4MeYRY0y1Maa6srIy8VYqpZSPDI+6mdFmJCwrgXPqgUWe+wuBxtiTROR84P8AW4wxralpnlJK+U8gA0s3bwKrRGSZiOQAHwK2ek8QkcXAT4A7jTGHU99MpZTyD0mzzthxM3pjTEhE7geeA4LAo8aYfSJyr/34w8BfAbOAb9kXIGSMqZ66Ziul1MxJt+GViZRuMMY8Azw
Tc+xhz+2PAx9PbdOUUsqfnB2mMqkzVimllIcT39NhG0HQQK+UUklzSzea0SulVGbKxJmxSimlPIY7Y2e2HYnSQK+UUkmSDJwZq5RSysOdGaudsUoplZkyca0bpZRSHjrqRimlMpw76kZLN0oplZk0o1dKqQznLIGgGb1SSmUod5niNImgadJMpZTyDy3dKKVUhtPOWKWUynSa0SulVGZzJ0xpRq+UUplpeHNwDfRKKZWRNKNXSqkM58R3zeiVUipDDS9TPMMNSVCaNFMppfzDyeO1dKOUUhnKqdGLBnqllMpMbo1eA71SSmUmXetGKaUynGjpRimlMluWndKHI2aGW5IYDfRKKZWkoAZ6pZTKbE6gjxgN9EoplZE0o1dKqQznDKtMkzivgV4ppZLllm7SJNInFOhFZLOIHBKRoyLyYJzHRUS+YT++W0Q2pr6pSinlD06gD2VKoBeRIPAQsAVYB9whIutiTtsCrLL/3QN8O8XtVEop3xgu3WRIoAc2AUeNMceNMYPAE8CtMefcCnzfWF4HykRkforbqpRSvlKSlzXTTUhIIq2sAuo89+uBSxI4pwo45T1JRO7ByvhZvHhxsm1VSilf+P0L5nOkqYv7rls5001JSCIZfbw5vrGfVxI5B2PMI8aYamNMdWVlZSLtU0op38nNCvLFm8+hJC97ppuSkEQCfT2wyHN/IdA4gXOUUkrNgEQC/ZvAKhFZJiI5wIeArTHnbAXuskffXAp0GGNOxb6QUkqp6Tdujd4YExKR+4HngCDwqDFmn4jcaz/+MPAMcDNwFOgF7p66JiullEpGQl3GxphnsIK599jDntsGuC+1TVNKKZUKOjNWKaUynAZ6pZTKcBrolVIqw2mgV0qpDCdmhtZqEJEW4OQEnz4bOJPC5kwHbfP00DZPvXRrL2RWm5cYY5KacTpjgX4yRKTGGFM90+1IhrZ5emibp166tRe0zVq6UUqpDKeBXimlMly6BvpHZroBE6Btnh7a5qmXbu2Fd3mb07JGr5RSKnHpmtErpZRKkAZ6pZTKcGkX6MfbqHymiMgJEdkjIjtFpMY+ViEiL4jIEfv/cs/5X7S/h0MictM0tfFREWkWkb2eY0m3UUQusr/Xo/am8PE2npnKNv+NiDTY13qniNzsszYvEpFfi8gBEdknIp+1j/vyWo/RXt9eZxHJE5HtIrLLbvOX7eO+vMbjtHnqr7MxJm3+YS2TfAxYDuQAu4B1M90uu20ngNkxx74KPGjffhD4J/v2OrvtucAy+3sKTkMb3wNsBPZOpo3AduAyrJ3FngW2THOb/wb4fJxz/dLm+cBG+3YxcNhumy+v9Rjt9e11tl+/yL6dDbwBXOrXazxOm6f8OqdbRp/IRuV+civwPfv294D3e44/YYwZMMa8g7WO/6apbowx5jfA2cm0UaxN30uMMduM9Rv3fc9zpqvNo/FLm08ZY962b3cBB7D2UPbltR6jvaOZ8etsLN323Wz7n8Gn13icNo8mZW1Ot0A/2ibkfmCA50XkLbE2QQeYa+ydtuz/59jH/fR9JNvGKvt27PHpdr+I7LZLO87Hc9+1WUSWAhuwsjffX+uY9oKPr7OIBEVkJ9AMvGCM8f01HqXNMMXXOd0CfUKbkM+QK4wxG4EtwH0i8p4xzvXz9+EYrY1+aPu3gRXAhcAp4Gv2cV+1WUSKgB8D/8UY0znWqXGOTXu747TX19fZGBM2xlyItUf1JhE5b4zT/dzmKb/O6RbofbsJuTGm0f6/GfgpVimmyf6Yhf1/s326n76PZNtYb9+OPT5tjDFN9h9MBPgOw2Uv37RZRLKxgua/G2N+Yh/27bWO1950uM52O9uBl4HN+Pgae3nbPB3XOd0CfSIblU87ESkUkWLnNnAjsBerbX9in/YnwFP27a3Ah0QkV0SWAauwOldmQlJttD8Od4nIpXZP/12e50wL5w/ZdhvWtfZNm+2v8X+BA8aYr3se8uW1Hq29fr7OIlIpImX27XzgBuA
gPr3GY7V5Wq7zVPQuT+U/rE3ID2P1QH9ppttjt2k5Vu/4LmCf0y5gFvAScMT+v8LznC/Z38MhpnAESEw7H8f6aDiElRV8bCJtBKrtX8ZjwDexZ1hPY5t/AOwBdtt/DPN91uYrsT5K7wZ22v9u9uu1HqO9vr3OwPnADrtte4G/so/78hqP0+Ypv866BIJSSmW4dCvdKKWUSpIGeqWUynAa6JVSKsNpoFdKqQyngV4ppTKcBnqllMpwGuiVUirD/X+RFfLtdUsCWgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(loaded_test_df.iloc[-1:]['spectrum'].values[0][::-1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Dataset composition" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### These datasets contain NIST `basic information` & KIT complat in-house `spectra`.\n", + "\n", + "Since we are not in the position to distribute NIST spectra, saved dataframes in this repository only have: \n", + "\n", + "1. NIST filenames and canonic SMILES without spectra.\n", + "2. KIT complat in-house filenames and canonic SMILES with spectra in the `spectrum` column.\n", + "\n", + "- Therefore, we can know how many NIST and in-house SMILES by checking the `spectrum` column.\n", + "- After you get NIST spectra, NIST spectra will be loaded in the next notebook.\n", + "- To purchase NIST spectra : __TBD__." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training dataset count 4819 / NIST 3930 / complat 889\n", + "Validation dataset count 733 / NIST 557 / complat 176\n", + "Testing dataset count 739 / NIST 579 / complat 160\n" + ] + } + ], + "source": [ + "print('Training dataset count {} / NIST {} / complat {}'.format(loaded_train_df.shape[0], loaded_train_df['spectrum'].isnull().sum(), loaded_train_df.shape[0] - loaded_train_df['spectrum'].isnull().sum()))\n", + "print('Validation dataset count {} / NIST {} / complat {}'.format(loaded_valid_df.shape[0], loaded_valid_df['spectrum'].isnull().sum(), loaded_valid_df.shape[0] - loaded_valid_df['spectrum'].isnull().sum()))\n", + "print('Testing dataset count {} / NIST {} / complat {}'.format(loaded_test_df.shape[0], loaded_test_df['spectrum'].isnull().sum(), loaded_test_df.shape[0] - loaded_test_df['spectrum'].isnull().sum()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Functional groups" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def clear_mapnum(mol):\n", + " [\n", + " atom.ClearProp('molAtomMapNumber')\n", + " for atom in mol.GetAtoms()\n", + " if atom.HasProp('molAtomMapNumber')\n", + " ]\n", + "\n", + "def extract_fgs(mol):\n", + " fg_smas = []\n", + " fgs = identify_functional_groups(mol)\n", + "\n", + " for fg in fgs:\n", + " target = fg.type\n", + " mol = Chem.MolFromSmarts(target)\n", + " clear_mapnum(mol)\n", + " sma = Chem.MolToSmarts(mol)\n", + " fg_smas.append(sma)\n", + "\n", + " return list(set(fg_smas))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "fg_dict = {}\n", + "for idx, csmi in enumerate(loaded_train_df['cano_smi']):\n", + " m = Chem.MolFromSmiles(csmi)\n", + " fgs = extract_fgs(m)\n", + " for fg in fgs:\n", + " if fg in fg_dict:\n", + " fg_dict[fg] += 1\n", + " else:\n", + " fg_dict[fg] = 1\n", + "\n", + "for idx, csmi in enumerate(loaded_valid_df['cano_smi']):\n", + " m = Chem.MolFromSmiles(csmi)\n", + " fgs = extract_fgs(m)\n", + " for fg in fgs:\n", + " if fg in fg_dict:\n", + " fg_dict[fg] += 1\n", + " else:\n", + " fg_dict[fg] = 1\n", + " \n", + "for idx, csmi in enumerate(loaded_test_df['cano_smi']):\n", + " m = Chem.MolFromSmiles(csmi)\n", + " fgs = extract_fgs(m)\n", + " for fg in fgs:\n", + " if fg in fg_dict:\n", + " fg_dict[fg] += 1\n", + " else:\n", + " fg_dict[fg] = 1\n", + " \n", + "fg_count_from_mol = dict(sorted(fg_dict.items(), key=lambda item: item[1], reverse=True))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "col_nms = loaded_train_df.columns[2:52]\n", + "fg_count_from_df = {}\n", + "for cn in col_nms:\n", + " count = loaded_train_df[cn].sum() + loaded_valid_df[cn].sum() + loaded_test_df[cn].sum()\n", + " fg_count_from_df[cn] = count" + ] + }, + { + "cell_type": "code", +
"execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "functional group SMARTS / count in the dataframe\n", + "CO / 765\n", + "cOC / 523\n", + "COC(-,:C)=O / 502\n", + "cnc / 440\n", + "cCl / 436\n", + "cO / 349\n", + "CCl / 318\n", + "CC(-,:C)=O / 263\n", + "cBr / 230\n", + "c[N&+](=O)[O&-] / 245\n", + "cC(-,:C)=O / 235\n", + "C=CC / 237\n", + "cN / 224\n", + "cC(=O)OC / 210\n", + "COC / 205\n", + "CF / 198\n", + "CBr / 195\n", + "coc / 169\n", + "cF / 171\n", + "CC(=O)O / 145\n", + "c=O / 135\n", + "c[n&H1]c / 124\n", + "csc / 113\n", + "cC=O / 105\n", + "CNC / 110\n", + "CN / 105\n", + "CN(-,:C)C / 105\n", + "cC#N / 82\n", + "cn(-,:c)C / 93\n", + "cC(=O)O / 89\n", + "CC=C(-,:C)C / 86\n", + "CC#N / 86\n", + "cNC(-,:C)=O / 79\n", + "cNC / 76\n", + "C/C=C/C / 66\n", + "CC=CC / 77\n", + "C=C(-,:C)C / 71\n", + "C#CC / 71\n", + "cC(-,:c)=O / 70\n", + "cN(-,:C)C / 71\n", + "CC(=O)OC / 69\n", + "CC#CC / 67\n", + "cI / 62\n", + "CNC(-,:C)=O / 56\n", + "cC=Cc / 54\n", + "c-n(-,:c)c / 37\n", + "cnn(-,:c)C / 46\n", + "cnnc / 32\n", + "cP(-,:c)c / 44\n", + "CS / 39\n" + ] + } + ], + "source": [ + "print('functional group SMARTS / count in the dataframe')\n", + "for cn in col_nms:\n", + " print('{} / {}'.format(cn, fg_count_from_df[cn]))\n", + " if fg_count_from_df[cn] != fg_count_from_mol[cn]:\n", + " print('- - - - - {} has wrong count between dataframe and IFG extraction.'.format(cn))\n", + " print(cn)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "50" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(col_nms)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "chem-dl-ir", + "language": "python", + "name": "chem-dl-ir" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/training_spectrum_to_fgs/INSTALL.md b/training_spectrum_to_fgs/INSTALL.md index b040cdd..e433e88 100644 --- a/training_spectrum_to_fgs/INSTALL.md +++ b/training_spectrum_to_fgs/INSTALL.md @@ -45,22 +45,19 @@ $ pip install numpy pandas matplotlib scikit-learn scipy Flask git+https://githu $ python -m ipykernel install --user --name=deep-ir-01 ``` - # 2. Training and Validation 1. To get nist data, you should buy it from `https://www.nist.gov/srd/nist-standard-reference-database-35`, and put & rename the folder to `./data/nist/`. ![nist data](https://github.com/JasonYCHuang/chem-dl-ir/blob/master/training_spectrum_to_fgs/assets/nist.jpg) - 2. run following files one-by-one - ``` +00_explanation_nist_and_in_house_dataset.ipynb 01_load_data.ipynb 02_train_model.ipynb 03_verify_model_and_save_acc.ipynb 04_count_spectra.ipynb ``` -