diff --git a/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb b/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb new file mode 100644 index 0000000..5355bef --- /dev/null +++ b/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb @@ -0,0 +1,1557 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 0. Load python libs" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import glob\n", + "import json\n", + "import nmrglue as ng\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pickle as pickle\n", + "import pandas as pd\n", + "import random\n", + "import math\n", + "from sklearn.model_selection import train_test_split\n", + "from collections import Counter, defaultdict, OrderedDict\n", + "\n", + "from scipy import interpolate\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from lib.carrier import SpectraCarrier\n", + "import lib.utils as utils" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020.09.1\n" + ] + } + ], + "source": [ + "import rdkit\n", + "print(rdkit.__version__)\n", + "\n", + "from rdkit import Chem\n", + "from rdkit import RDLogger\n", + "RDLogger.DisableLog('rdApp.*')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from lib.ifg import identify_functional_groups" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Load dataframes" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = './data'\n", + "SOURCE_DIR = './data/source'\n", + "TARGET_DIR = './data/target'" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "fn_train_df = '{}/train_df.pk'.format(SOURCE_DIR)\n", + "fn_valid_df = '{}/valid_df.pk'.format(SOURCE_DIR)\n", + "fn_test_df = '{}/test_df.pk'.format(SOURCE_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_train_df = None\n", + "with open(fn_train_df, 'rb') as file:\n", + " loaded_train_df = pickle.load(file)\n", + " \n", + "loaded_valid_df = None\n", + "with open(fn_valid_df, 'rb') as file:\n", + " loaded_valid_df = pickle.load(file)\n", + " \n", + "loaded_test_df = None\n", + "with open(fn_test_df, 'rb') as file:\n", + " loaded_test_df = pickle.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_rows', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fncano_smiCOcOCCOC(-,:C)=OcnccClcOCClCC(-,:C)=OcBrc[N&+](=O)[O&-]cC(-,:C)=OC=CCcNcC(=O)OCCOCCFCBrcoccFCC(=O)Oc=Oc[n&H1]ccsccC=OCNCCNCN(-,:C)CcC#Ncn(-,:c)CcC(=O)OCC=C(-,:C)CCC#NcNC(-,:C)=OcNCC/C=C/CCC=CCC=C(-,:C)CC#CCcC(-,:c)=OcN(-,:C)CCC(=O)OCCC#CCcICNC(-,:C)=OcC=Ccc-n(-,:c)ccnn(-,:c)CcnnccP(-,:c)cCSspectrum(W)-CO(W)-cOC(W)-COC(-,:C)=O(W)-cnc(W)-cCl(W)-cO(W)-CCl(W)-CC(-,:C)=O(W)-cBr(W)-c[N&+](=O)[O&-](W)-cC(-,:C)=O(W)-C=CC(W)-cN(W)-cC(=O)OC(W)-COC(W)-CF(W)-CBr(W)-coc(W)-cF(W)-CC(=O)O(W)-c=O(W)-c[n&H1]c(W)-csc(W)-cC=O(W)-CNC(W)-CN(W)-CN(-,:C)C(W)-cC#N(W)-cn(-,:c)C(W)-cC(=O)O(W)-CC=C(-,:C)C(W)-CC#N(W)-cNC(-,:C)=O(W)-cNC(W)-C/C=C/C(W)-CC=CC(W)-C=C(-,:C)C(W)-C#CC(W)-cC(-,:c)=O(W)-cN(-,:C)C(W)-CC(=O)OC(W)-CC#CC(W)-cI(W)-CNC(-,:C)=O(W)-cC=Cc(W)-c-n(-,:c)c(W)-cnn(-,:c)C(W)-cnnc(W)-cP(-,:c)c(W)-CS
01136-86-3COc1cc(C(C)=O)cc(OC)c1OC01000000001000000000000000000000000000000000000000None0.5678777.2023100.5407320.5329570.536040.5281670.5263220.5215370.5169490.51996115.0593750.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.5084410.5085480.5084410.5059850.5071560.507050.5068360.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
3610-54-8CCOc1ccc([N+](=O)[O-])cc1[N+](=O)[O-]01000000010000000000000000000000000000000000000000None0.5678777.2023100.5407320.5329570.536040.5281670.5263220.5215370.51694914.3267570.5189530.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.5084410.5085480.5084410.5059850.5071560.507050.5068360.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
5645-36-3CCOC(CN)OCC00000000000000000000000001000000000000000000000000None0.5678770.5413390.5407320.5329570.536040.5281670.5263220.5215370.5169490.5199610.5189530.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.50844132.7216050.5084410.5059850.5071560.507050.5068360.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
\n", + "
" + ], + "text/plain": [ + " fn cano_smi CO cOC COC(-,:C)=O \\\n", + "0 1136-86-3 COc1cc(C(C)=O)cc(OC)c1OC 0 1 0 \n", + "3 610-54-8 CCOc1ccc([N+](=O)[O-])cc1[N+](=O)[O-] 0 1 0 \n", + "5 645-36-3 CCOC(CN)OCC 0 0 0 \n", + "\n", + " cnc cCl cO CCl CC(-,:C)=O cBr c[N&+](=O)[O&-] cC(-,:C)=O C=CC cN \\\n", + "0 0 0 0 0 0 0 0 1 0 0 \n", + "3 0 0 0 0 0 0 1 0 0 0 \n", + "5 0 0 0 0 0 0 0 0 0 0 \n", + "\n", + " cC(=O)OC COC CF CBr coc cF CC(=O)O c=O c[n&H1]c csc cC=O CNC \\\n", + "0 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "5 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "\n", + " CN CN(-,:C)C cC#N cn(-,:c)C cC(=O)O CC=C(-,:C)C CC#N cNC(-,:C)=O \\\n", + "0 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 \n", + "5 1 0 0 0 0 0 0 0 \n", + "\n", + " cNC C/C=C/C CC=CC C=C(-,:C)C C#CC cC(-,:c)=O cN(-,:C)C CC(=O)OC \\\n", + "0 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 \n", + "5 0 0 0 0 0 0 0 0 \n", + "\n", + " CC#CC cI CNC(-,:C)=O cC=Cc c-n(-,:c)c cnn(-,:c)C cnnc cP(-,:c)c CS \\\n", + "0 0 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 0 \n", + "5 0 0 0 0 0 0 0 0 0 \n", + "\n", + " spectrum (W)-CO (W)-cOC (W)-COC(-,:C)=O (W)-cnc (W)-cCl (W)-cO \\\n", + "0 None 0.567877 7.202310 0.540732 0.532957 0.53604 0.528167 \n", + "3 None 0.567877 7.202310 0.540732 0.532957 0.53604 0.528167 \n", + "5 None 0.567877 0.541339 0.540732 0.532957 0.53604 0.528167 \n", + "\n", + " (W)-CCl (W)-CC(-,:C)=O (W)-cBr (W)-c[N&+](=O)[O&-] (W)-cC(-,:C)=O \\\n", + "0 0.526322 0.521537 0.516949 0.519961 15.059375 \n", + "3 0.526322 0.521537 0.516949 14.326757 0.518953 \n", + "5 0.526322 0.521537 0.516949 0.519961 0.518953 \n", + "\n", + " (W)-C=CC (W)-cN (W)-cC(=O)OC (W)-COC (W)-CF (W)-CBr (W)-coc \\\n", + "0 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "3 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "5 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "\n", + " (W)-cF (W)-CC(=O)O (W)-c=O (W)-c[n&H1]c (W)-csc (W)-cC=O \\\n", + "0 0.513315 0.511245 
0.510595 0.509516 0.508978 0.508226 \n", + "3 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n", + "5 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n", + "\n", + " (W)-CNC (W)-CN (W)-CN(-,:C)C (W)-cC#N (W)-cn(-,:c)C (W)-cC(=O)O \\\n", + "0 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n", + "3 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n", + "5 0.508441 32.721605 0.508441 0.505985 0.507156 0.50705 \n", + "\n", + " (W)-CC=C(-,:C)C (W)-CC#N (W)-cNC(-,:C)=O (W)-cNC (W)-C/C=C/C \\\n", + "0 0.506836 0.50673 0.505772 0.505879 0.505348 \n", + "3 0.506836 0.50673 0.505772 0.505879 0.505348 \n", + "5 0.506836 0.50673 0.505772 0.505879 0.505348 \n", + "\n", + " (W)-CC=CC (W)-C=C(-,:C)C (W)-C#CC (W)-cC(-,:c)=O (W)-cN(-,:C)C \\\n", + "0 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "3 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "5 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "\n", + " (W)-CC(=O)OC (W)-CC#CC (W)-cI (W)-CNC(-,:C)=O (W)-cC=Cc \\\n", + "0 0.505242 0.505136 0.504713 0.504185 0.504079 \n", + "3 0.505242 0.505136 0.504713 0.504185 0.504079 \n", + "5 0.505242 0.505136 0.504713 0.504185 0.504079 \n", + "\n", + " (W)-c-n(-,:c)c (W)-cnn(-,:c)C (W)-cnnc (W)-cP(-,:c)c (W)-CS \n", + "0 0.501979 0.503132 0.502398 0.502922 0.502398 \n", + "3 0.501979 0.503132 0.502398 0.502922 0.502398 \n", + "5 0.501979 0.503132 0.502398 0.502922 0.502398 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loaded_train_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fncano_smiCOcOCCOC(-,:C)=OcnccClcOCClCC(-,:C)=OcBrc[N&+](=O)[O&-]cC(-,:C)=OC=CCcNcC(=O)OCCOCCFCBrcoccFCC(=O)Oc=Oc[n&H1]ccsccC=OCNCCNCN(-,:C)CcC#Ncn(-,:c)CcC(=O)OCC=C(-,:C)CCC#NcNC(-,:C)=OcNCC/C=C/CCC=CCC=C(-,:C)CC#CCcC(-,:c)=OcN(-,:C)CCC(=O)OCCC#CCcICNC(-,:C)=OcC=Ccc-n(-,:c)ccnn(-,:c)CcnnccP(-,:c)cCSspectrum(W)-CO(W)-cOC(W)-COC(-,:C)=O(W)-cnc(W)-cCl(W)-cO(W)-CCl(W)-CC(-,:C)=O(W)-cBr(W)-c[N&+](=O)[O&-](W)-cC(-,:C)=O(W)-C=CC(W)-cN(W)-cC(=O)OC(W)-COC(W)-CF(W)-CBr(W)-coc(W)-cF(W)-CC(=O)O(W)-c=O(W)-c[n&H1]c(W)-csc(W)-cC=O(W)-CNC(W)-CN(W)-CN(-,:C)C(W)-cC#N(W)-cn(-,:c)C(W)-cC(=O)O(W)-CC=C(-,:C)C(W)-CC#N(W)-cNC(-,:C)=O(W)-cNC(W)-C/C=C/C(W)-CC=CC(W)-C=C(-,:C)C(W)-C#CC(W)-cC(-,:c)=O(W)-cN(-,:C)C(W)-CC(=O)OC(W)-CC#CC(W)-cI(W)-CNC(-,:C)=O(W)-cC=Cc(W)-c-n(-,:c)c(W)-cnn(-,:c)C(W)-cnnc(W)-cP(-,:c)c(W)-CS
6288a5e81bc8-7cd1-4603-bf77-d6635351f25bCC1=CC[C@@H](C2(C)CC2(C)C(=O)OC(C)(C)C)CC100100000000000000000000000000010000000000000000000[0.9202703349311558, 0.9234971350037704, 0.926...0.5678770.5413397.3015150.5329570.536040.5281670.5263220.5215370.5169490.5199610.5189530.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.5084410.5085480.5084410.5059850.5071560.5070540.7761540.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
6289427ee4cc-e725-4b40-829d-51b14984e029Brc1cnc2ccccc2n100010000100000000000000000000000000000000000000000[0.6751178997222926, 0.6719449948267988, 0.668...0.5678770.5413390.5407328.8941280.536040.5281670.5263220.52153716.7750000.5199610.5189530.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.5084410.5085480.5084410.5059850.5071560.507050.5068360.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
6290dcf80c78-98e2-4ef9-b8a8-bc259aa8f9a4CCCC[Sn](CCCC)(CCCC)c1cnc2ccccc2n100010000000000000000000000000000000000000000000000[0.5719682556458464, 0.5722615937762792, 0.571...0.5678770.5413390.5407328.8941280.536040.5281670.5263220.5215370.5169490.5199610.5189530.5191770.5182840.5161740.5161740.514740.5155110.5124420.5133150.5112450.5105950.5095160.5089780.5082260.5084410.5085480.5084410.5059850.5071560.507050.5068360.506730.5057720.5058790.5053480.5059850.5056660.5056660.5053480.505560.5052420.5051360.5047130.5041850.5040790.5019790.5031320.5023980.5029220.502398
\n", + "
" + ], + "text/plain": [ + " fn \\\n", + "6288 a5e81bc8-7cd1-4603-bf77-d6635351f25b \n", + "6289 427ee4cc-e725-4b40-829d-51b14984e029 \n", + "6290 dcf80c78-98e2-4ef9-b8a8-bc259aa8f9a4 \n", + "\n", + " cano_smi CO cOC COC(-,:C)=O cnc \\\n", + "6288 CC1=CC[C@@H](C2(C)CC2(C)C(=O)OC(C)(C)C)CC1 0 0 1 0 \n", + "6289 Brc1cnc2ccccc2n1 0 0 0 1 \n", + "6290 CCCC[Sn](CCCC)(CCCC)c1cnc2ccccc2n1 0 0 0 1 \n", + "\n", + " cCl cO CCl CC(-,:C)=O cBr c[N&+](=O)[O&-] cC(-,:C)=O C=CC cN \\\n", + "6288 0 0 0 0 0 0 0 0 0 \n", + "6289 0 0 0 0 1 0 0 0 0 \n", + "6290 0 0 0 0 0 0 0 0 0 \n", + "\n", + " cC(=O)OC COC CF CBr coc cF CC(=O)O c=O c[n&H1]c csc cC=O CNC \\\n", + "6288 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "6289 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "6290 0 0 0 0 0 0 0 0 0 0 0 0 \n", + "\n", + " CN CN(-,:C)C cC#N cn(-,:c)C cC(=O)O CC=C(-,:C)C CC#N cNC(-,:C)=O \\\n", + "6288 0 0 0 0 0 1 0 0 \n", + "6289 0 0 0 0 0 0 0 0 \n", + "6290 0 0 0 0 0 0 0 0 \n", + "\n", + " cNC C/C=C/C CC=CC C=C(-,:C)C C#CC cC(-,:c)=O cN(-,:C)C CC(=O)OC \\\n", + "6288 0 0 0 0 0 0 0 0 \n", + "6289 0 0 0 0 0 0 0 0 \n", + "6290 0 0 0 0 0 0 0 0 \n", + "\n", + " CC#CC cI CNC(-,:C)=O cC=Cc c-n(-,:c)c cnn(-,:c)C cnnc cP(-,:c)c \\\n", + "6288 0 0 0 0 0 0 0 0 \n", + "6289 0 0 0 0 0 0 0 0 \n", + "6290 0 0 0 0 0 0 0 0 \n", + "\n", + " CS spectrum (W)-CO \\\n", + "6288 0 [0.9202703349311558, 0.9234971350037704, 0.926... 0.567877 \n", + "6289 0 [0.6751178997222926, 0.6719449948267988, 0.668... 0.567877 \n", + "6290 0 [0.5719682556458464, 0.5722615937762792, 0.571... 
0.567877 \n", + "\n", + " (W)-cOC (W)-COC(-,:C)=O (W)-cnc (W)-cCl (W)-cO (W)-CCl \\\n", + "6288 0.541339 7.301515 0.532957 0.53604 0.528167 0.526322 \n", + "6289 0.541339 0.540732 8.894128 0.53604 0.528167 0.526322 \n", + "6290 0.541339 0.540732 8.894128 0.53604 0.528167 0.526322 \n", + "\n", + " (W)-CC(-,:C)=O (W)-cBr (W)-c[N&+](=O)[O&-] (W)-cC(-,:C)=O \\\n", + "6288 0.521537 0.516949 0.519961 0.518953 \n", + "6289 0.521537 16.775000 0.519961 0.518953 \n", + "6290 0.521537 0.516949 0.519961 0.518953 \n", + "\n", + " (W)-C=CC (W)-cN (W)-cC(=O)OC (W)-COC (W)-CF (W)-CBr (W)-coc \\\n", + "6288 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "6289 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "6290 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n", + "\n", + " (W)-cF (W)-CC(=O)O (W)-c=O (W)-c[n&H1]c (W)-csc (W)-cC=O \\\n", + "6288 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n", + "6289 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n", + "6290 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n", + "\n", + " (W)-CNC (W)-CN (W)-CN(-,:C)C (W)-cC#N (W)-cn(-,:c)C (W)-cC(=O)O \\\n", + "6288 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n", + "6289 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n", + "6290 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n", + "\n", + " (W)-CC=C(-,:C)C (W)-CC#N (W)-cNC(-,:C)=O (W)-cNC (W)-C/C=C/C \\\n", + "6288 40.776154 0.50673 0.505772 0.505879 0.505348 \n", + "6289 0.506836 0.50673 0.505772 0.505879 0.505348 \n", + "6290 0.506836 0.50673 0.505772 0.505879 0.505348 \n", + "\n", + " (W)-CC=CC (W)-C=C(-,:C)C (W)-C#CC (W)-cC(-,:c)=O (W)-cN(-,:C)C \\\n", + "6288 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "6289 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "6290 0.505985 0.505666 0.505666 0.505348 0.50556 \n", + "\n", + " (W)-CC(=O)OC (W)-CC#CC (W)-cI (W)-CNC(-,:C)=O (W)-cC=Cc \\\n", + "6288 0.505242 0.505136 0.504713 0.504185 
0.504079 \n", + "6289 0.505242 0.505136 0.504713 0.504185 0.504079 \n", + "6290 0.505242 0.505136 0.504713 0.504185 0.504079 \n", + "\n", + " (W)-c-n(-,:c)c (W)-cnn(-,:c)C (W)-cnnc (W)-cP(-,:c)c (W)-CS \n", + "6288 0.501979 0.503132 0.502398 0.502922 0.502398 \n", + "6289 0.501979 0.503132 0.502398 0.502922 0.502398 \n", + "6290 0.501979 0.503132 0.502398 0.502922 0.502398 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loaded_train_df.tail(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(loaded_test_df.iloc[-1:]['spectrum'].values[0][::-1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Dataset composition" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### These datasets contain NIST `basic information` and KIT complat in-house `spectra`.\n", + "\n", + "Since we are not in a position to distribute NIST spectra, the dataframes saved in this repository only contain:\n", + "\n", + "1. NIST filenames and canonical SMILES, without spectra.\n", + "2. KIT complat in-house filenames and canonical SMILES, with spectra in the `spectrum` column.\n", + "\n", + "- Therefore, the number of NIST and in-house entries can be determined by checking the `spectrum` column: it is empty (`None`) for NIST rows.\n", + "- Once you have obtained the NIST spectra, they will be loaded in the next notebook.\n", + "- To purchase NIST spectra: __TBD__." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training dataset count 4819 / NIST 3930 / complat 889\n", + "Validation dataset count 733 / NIST 557 / complat 176\n", + "Testing dataset count 739 / NIST 579 / complat 160\n" + ] + } + ], + "source": [ + "print('Training dataset count {} / NIST {} / complat {}'.format(loaded_train_df.shape[0], loaded_train_df['spectrum'].isnull().sum(), loaded_train_df.shape[0] - loaded_train_df['spectrum'].isnull().sum()))\n", + "print('Validation dataset count {} / NIST {} / complat {}'.format(loaded_valid_df.shape[0], loaded_valid_df['spectrum'].isnull().sum(), loaded_valid_df.shape[0] - loaded_valid_df['spectrum'].isnull().sum()))\n", + "print('Testing dataset count {} / NIST {} / complat {}'.format(loaded_test_df.shape[0], loaded_test_df['spectrum'].isnull().sum(), loaded_test_df.shape[0] - loaded_test_df['spectrum'].isnull().sum()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Functional groups" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def clear_mapnum(mol):\n", + " [\n", + " atom.ClearProp('molAtomMapNumber')\n", + " for atom in mol.GetAtoms()\n", + " if atom.HasProp('molAtomMapNumber')\n", + " ]\n", + "\n", + "def extract_fgs(mol):\n", + " fg_smas = []\n", + " fgs = identify_functional_groups(mol)\n", + "\n", + " for fg in fgs:\n", + " target = fg.type\n", + " mol = Chem.MolFromSmarts(target)\n", + " clear_mapnum(mol)\n", + " sma = Chem.MolToSmarts(mol)\n", + " fg_smas.append(sma)\n", + "\n", + " return list(set(fg_smas))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "fg_dict = {}\n", + "for idx, csmi in enumerate(loaded_train_df['cano_smi']):\n", + " m = Chem.MolFromSmiles(csmi)\n", + " fgs = extract_fgs(m)\n", + " for fg in fgs:\n", + " if fg in fg_dict:\n", + " fg_dict[fg] += 1\n", + " else:\n", + " fg_dict[fg] = 1\n", + "\n", + "for idx, csmi in enumerate(loaded_valid_df['cano_smi']):\n", + " m = Chem.MolFromSmiles(csmi)\n", + " fgs = extract_fgs(m)\n", + " for fg in fgs:\n", + " if fg in fg_dict:\n", + " fg_dict[fg] += 1\n", + " else:\n", + " fg_dict[fg] = 1\n", + " \n", + "for idx, csmi in enumerate(loaded_test_df['cano_smi']):\n", + " m = Chem.MolFromSmiles(csmi)\n", + " fgs = extract_fgs(m)\n", + " for fg in fgs:\n", + " if fg in fg_dict:\n", + " fg_dict[fg] += 1\n", + " else:\n", + " fg_dict[fg] = 1\n", + " \n", + "fg_count_from_mol = dict(sorted(fg_dict.items(), key=lambda item: item[1], reverse=True))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "col_nms = loaded_train_df.columns[2:52]\n", + "fg_count_from_df = {}\n", + "for cn in col_nms:\n", + " count = loaded_train_df[cn].sum() + loaded_valid_df[cn].sum() + loaded_test_df[cn].sum()\n", + " fg_count_from_df[cn] = count" + ] + }, + { + "cell_type": "code", + 
"execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "functional group SMARTS / count in the dataframe\n", + "CO / 765\n", + "cOC / 523\n", + "COC(-,:C)=O / 502\n", + "cnc / 440\n", + "cCl / 436\n", + "cO / 349\n", + "CCl / 318\n", + "CC(-,:C)=O / 263\n", + "cBr / 230\n", + "c[N&+](=O)[O&-] / 245\n", + "cC(-,:C)=O / 235\n", + "C=CC / 237\n", + "cN / 224\n", + "cC(=O)OC / 210\n", + "COC / 205\n", + "CF / 198\n", + "CBr / 195\n", + "coc / 169\n", + "cF / 171\n", + "CC(=O)O / 145\n", + "c=O / 135\n", + "c[n&H1]c / 124\n", + "csc / 113\n", + "cC=O / 105\n", + "CNC / 110\n", + "CN / 105\n", + "CN(-,:C)C / 105\n", + "cC#N / 82\n", + "cn(-,:c)C / 93\n", + "cC(=O)O / 89\n", + "CC=C(-,:C)C / 86\n", + "CC#N / 86\n", + "cNC(-,:C)=O / 79\n", + "cNC / 76\n", + "C/C=C/C / 66\n", + "CC=CC / 77\n", + "C=C(-,:C)C / 71\n", + "C#CC / 71\n", + "cC(-,:c)=O / 70\n", + "cN(-,:C)C / 71\n", + "CC(=O)OC / 69\n", + "CC#CC / 67\n", + "cI / 62\n", + "CNC(-,:C)=O / 56\n", + "cC=Cc / 54\n", + "c-n(-,:c)c / 37\n", + "cnn(-,:c)C / 46\n", + "cnnc / 32\n", + "cP(-,:c)c / 44\n", + "CS / 39\n" + ] + } + ], + "source": [ + "print('functional group SMARTS / count in the dataframe')\n", + "for cn in col_nms:\n", + " print('{} / {}'.format(cn, fg_count_from_df[cn]))\n", + " if fg_count_from_df[cn] != fg_count_from_mol[cn]:\n", + " print('- - - - - {} has wrong count between dataframe and IFG extraction.'.format(cn))\n", + " print(cn)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "50" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(col_nms)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "chem-dl-ir", + "language": "python", + "name": "chem-dl-ir" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/training_spectrum_to_fgs/INSTALL.md b/training_spectrum_to_fgs/INSTALL.md index b040cdd..e433e88 100644 --- a/training_spectrum_to_fgs/INSTALL.md +++ b/training_spectrum_to_fgs/INSTALL.md @@ -45,22 +45,19 @@ $ pip install numpy pandas matplotlib scikit-learn scipy Flask git+https://githu $ python -m ipykernel install --user --name=deep-ir-01 ``` - # 2. Training and Validation 1. To get nist data, you should buy it from `https://www.nist.gov/srd/nist-standard-reference-database-35`, and put & rename the folder to `./data/nist/`. ![nist data](https://github.com/JasonYCHuang/chem-dl-ir/blob/master/training_spectrum_to_fgs/assets/nist.jpg) - 2. run following files one-by-one - ``` +00_explanation_nist_and_in_house_dataset.ipynb 01_load_data.ipynb 02_train_model.ipynb 03_verify_model_and_save_acc.ipynb 04_count_spectra.ipynb ``` -