diff --git a/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb b/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb
new file mode 100644
index 0000000..5355bef
--- /dev/null
+++ b/training_spectrum_to_fgs/00_explanation_nist_and_in_house_dataset.ipynb
@@ -0,0 +1,1557 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 0. Load python libs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import glob\n",
+ "import json\n",
+ "import nmrglue as ng\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import pickle as pickle\n",
+ "import pandas as pd\n",
+ "import random\n",
+ "import math\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from collections import Counter, defaultdict, OrderedDict\n",
+ "\n",
+ "from scipy import interpolate\n",
+ "\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from lib.carrier import SpectraCarrier\n",
+ "import lib.utils as utils"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2020.09.1\n"
+ ]
+ }
+ ],
+ "source": [
+ "import rdkit\n",
+ "print(rdkit.__version__)\n",
+ "\n",
+ "from rdkit import Chem\n",
+ "from rdkit import RDLogger\n",
+ "RDLogger.DisableLog('rdApp.*')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from lib.ifg import identify_functional_groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1. Load dataframes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DATA_DIR = './data'\n",
+ "SOURCE_DIR = './data/source'\n",
+ "TARGET_DIR = './data/target'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fn_train_df = '{}/train_df.pk'.format(SOURCE_DIR)\n",
+ "fn_valid_df = '{}/valid_df.pk'.format(SOURCE_DIR)\n",
+ "fn_test_df = '{}/test_df.pk'.format(SOURCE_DIR)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def load_pickled_df(path):\n",
+ "    \"\"\"Load and return the object pickled at `path`.\"\"\"\n",
+ "    # NOTE: pickle.load can execute arbitrary code -- only open files you trust.\n",
+ "    with open(path, 'rb') as fh:\n",
+ "        return pickle.load(fh)\n",
+ "\n",
+ "\n",
+ "# The train / validation / test splits are stored as three separate pickles.\n",
+ "loaded_train_df = load_pickled_df(fn_train_df)\n",
+ "loaded_valid_df = load_pickled_df(fn_valid_df)\n",
+ "loaded_test_df = load_pickled_df(fn_test_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.set_option('display.max_columns', None)\n",
+ "pd.set_option('display.max_rows', None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " fn | \n",
+ " cano_smi | \n",
+ " CO | \n",
+ " cOC | \n",
+ " COC(-,:C)=O | \n",
+ " cnc | \n",
+ " cCl | \n",
+ " cO | \n",
+ " CCl | \n",
+ " CC(-,:C)=O | \n",
+ " cBr | \n",
+ " c[N&+](=O)[O&-] | \n",
+ " cC(-,:C)=O | \n",
+ " C=CC | \n",
+ " cN | \n",
+ " cC(=O)OC | \n",
+ " COC | \n",
+ " CF | \n",
+ " CBr | \n",
+ " coc | \n",
+ " cF | \n",
+ " CC(=O)O | \n",
+ " c=O | \n",
+ " c[n&H1]c | \n",
+ " csc | \n",
+ " cC=O | \n",
+ " CNC | \n",
+ " CN | \n",
+ " CN(-,:C)C | \n",
+ " cC#N | \n",
+ " cn(-,:c)C | \n",
+ " cC(=O)O | \n",
+ " CC=C(-,:C)C | \n",
+ " CC#N | \n",
+ " cNC(-,:C)=O | \n",
+ " cNC | \n",
+ " C/C=C/C | \n",
+ " CC=CC | \n",
+ " C=C(-,:C)C | \n",
+ " C#CC | \n",
+ " cC(-,:c)=O | \n",
+ " cN(-,:C)C | \n",
+ " CC(=O)OC | \n",
+ " CC#CC | \n",
+ " cI | \n",
+ " CNC(-,:C)=O | \n",
+ " cC=Cc | \n",
+ " c-n(-,:c)c | \n",
+ " cnn(-,:c)C | \n",
+ " cnnc | \n",
+ " cP(-,:c)c | \n",
+ " CS | \n",
+ " spectrum | \n",
+ " (W)-CO | \n",
+ " (W)-cOC | \n",
+ " (W)-COC(-,:C)=O | \n",
+ " (W)-cnc | \n",
+ " (W)-cCl | \n",
+ " (W)-cO | \n",
+ " (W)-CCl | \n",
+ " (W)-CC(-,:C)=O | \n",
+ " (W)-cBr | \n",
+ " (W)-c[N&+](=O)[O&-] | \n",
+ " (W)-cC(-,:C)=O | \n",
+ " (W)-C=CC | \n",
+ " (W)-cN | \n",
+ " (W)-cC(=O)OC | \n",
+ " (W)-COC | \n",
+ " (W)-CF | \n",
+ " (W)-CBr | \n",
+ " (W)-coc | \n",
+ " (W)-cF | \n",
+ " (W)-CC(=O)O | \n",
+ " (W)-c=O | \n",
+ " (W)-c[n&H1]c | \n",
+ " (W)-csc | \n",
+ " (W)-cC=O | \n",
+ " (W)-CNC | \n",
+ " (W)-CN | \n",
+ " (W)-CN(-,:C)C | \n",
+ " (W)-cC#N | \n",
+ " (W)-cn(-,:c)C | \n",
+ " (W)-cC(=O)O | \n",
+ " (W)-CC=C(-,:C)C | \n",
+ " (W)-CC#N | \n",
+ " (W)-cNC(-,:C)=O | \n",
+ " (W)-cNC | \n",
+ " (W)-C/C=C/C | \n",
+ " (W)-CC=CC | \n",
+ " (W)-C=C(-,:C)C | \n",
+ " (W)-C#CC | \n",
+ " (W)-cC(-,:c)=O | \n",
+ " (W)-cN(-,:C)C | \n",
+ " (W)-CC(=O)OC | \n",
+ " (W)-CC#CC | \n",
+ " (W)-cI | \n",
+ " (W)-CNC(-,:C)=O | \n",
+ " (W)-cC=Cc | \n",
+ " (W)-c-n(-,:c)c | \n",
+ " (W)-cnn(-,:c)C | \n",
+ " (W)-cnnc | \n",
+ " (W)-cP(-,:c)c | \n",
+ " (W)-CS | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1136-86-3 | \n",
+ " COc1cc(C(C)=O)cc(OC)c1OC | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " None | \n",
+ " 0.567877 | \n",
+ " 7.202310 | \n",
+ " 0.540732 | \n",
+ " 0.532957 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 0.516949 | \n",
+ " 0.519961 | \n",
+ " 15.059375 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 0.508548 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 0.506836 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 610-54-8 | \n",
+ " CCOc1ccc([N+](=O)[O-])cc1[N+](=O)[O-] | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " None | \n",
+ " 0.567877 | \n",
+ " 7.202310 | \n",
+ " 0.540732 | \n",
+ " 0.532957 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 0.516949 | \n",
+ " 14.326757 | \n",
+ " 0.518953 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 0.508548 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 0.506836 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 645-36-3 | \n",
+ " CCOC(CN)OCC | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " None | \n",
+ " 0.567877 | \n",
+ " 0.541339 | \n",
+ " 0.540732 | \n",
+ " 0.532957 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 0.516949 | \n",
+ " 0.519961 | \n",
+ " 0.518953 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 32.721605 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 0.506836 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " fn cano_smi CO cOC COC(-,:C)=O \\\n",
+ "0 1136-86-3 COc1cc(C(C)=O)cc(OC)c1OC 0 1 0 \n",
+ "3 610-54-8 CCOc1ccc([N+](=O)[O-])cc1[N+](=O)[O-] 0 1 0 \n",
+ "5 645-36-3 CCOC(CN)OCC 0 0 0 \n",
+ "\n",
+ " cnc cCl cO CCl CC(-,:C)=O cBr c[N&+](=O)[O&-] cC(-,:C)=O C=CC cN \\\n",
+ "0 0 0 0 0 0 0 0 1 0 0 \n",
+ "3 0 0 0 0 0 0 1 0 0 0 \n",
+ "5 0 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " cC(=O)OC COC CF CBr coc cF CC(=O)O c=O c[n&H1]c csc cC=O CNC \\\n",
+ "0 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "5 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " CN CN(-,:C)C cC#N cn(-,:c)C cC(=O)O CC=C(-,:C)C CC#N cNC(-,:C)=O \\\n",
+ "0 0 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 0 \n",
+ "5 1 0 0 0 0 0 0 0 \n",
+ "\n",
+ " cNC C/C=C/C CC=CC C=C(-,:C)C C#CC cC(-,:c)=O cN(-,:C)C CC(=O)OC \\\n",
+ "0 0 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 0 \n",
+ "5 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " CC#CC cI CNC(-,:C)=O cC=Cc c-n(-,:c)c cnn(-,:c)C cnnc cP(-,:c)c CS \\\n",
+ "0 0 0 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 0 0 \n",
+ "5 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " spectrum (W)-CO (W)-cOC (W)-COC(-,:C)=O (W)-cnc (W)-cCl (W)-cO \\\n",
+ "0 None 0.567877 7.202310 0.540732 0.532957 0.53604 0.528167 \n",
+ "3 None 0.567877 7.202310 0.540732 0.532957 0.53604 0.528167 \n",
+ "5 None 0.567877 0.541339 0.540732 0.532957 0.53604 0.528167 \n",
+ "\n",
+ " (W)-CCl (W)-CC(-,:C)=O (W)-cBr (W)-c[N&+](=O)[O&-] (W)-cC(-,:C)=O \\\n",
+ "0 0.526322 0.521537 0.516949 0.519961 15.059375 \n",
+ "3 0.526322 0.521537 0.516949 14.326757 0.518953 \n",
+ "5 0.526322 0.521537 0.516949 0.519961 0.518953 \n",
+ "\n",
+ " (W)-C=CC (W)-cN (W)-cC(=O)OC (W)-COC (W)-CF (W)-CBr (W)-coc \\\n",
+ "0 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "3 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "5 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "\n",
+ " (W)-cF (W)-CC(=O)O (W)-c=O (W)-c[n&H1]c (W)-csc (W)-cC=O \\\n",
+ "0 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "3 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "5 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "\n",
+ " (W)-CNC (W)-CN (W)-CN(-,:C)C (W)-cC#N (W)-cn(-,:c)C (W)-cC(=O)O \\\n",
+ "0 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n",
+ "3 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n",
+ "5 0.508441 32.721605 0.508441 0.505985 0.507156 0.50705 \n",
+ "\n",
+ " (W)-CC=C(-,:C)C (W)-CC#N (W)-cNC(-,:C)=O (W)-cNC (W)-C/C=C/C \\\n",
+ "0 0.506836 0.50673 0.505772 0.505879 0.505348 \n",
+ "3 0.506836 0.50673 0.505772 0.505879 0.505348 \n",
+ "5 0.506836 0.50673 0.505772 0.505879 0.505348 \n",
+ "\n",
+ " (W)-CC=CC (W)-C=C(-,:C)C (W)-C#CC (W)-cC(-,:c)=O (W)-cN(-,:C)C \\\n",
+ "0 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "3 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "5 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "\n",
+ " (W)-CC(=O)OC (W)-CC#CC (W)-cI (W)-CNC(-,:C)=O (W)-cC=Cc \\\n",
+ "0 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "3 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "5 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "\n",
+ " (W)-c-n(-,:c)c (W)-cnn(-,:c)C (W)-cnnc (W)-cP(-,:c)c (W)-CS \n",
+ "0 0.501979 0.503132 0.502398 0.502922 0.502398 \n",
+ "3 0.501979 0.503132 0.502398 0.502922 0.502398 \n",
+ "5 0.501979 0.503132 0.502398 0.502922 0.502398 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "loaded_train_df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " fn | \n",
+ " cano_smi | \n",
+ " CO | \n",
+ " cOC | \n",
+ " COC(-,:C)=O | \n",
+ " cnc | \n",
+ " cCl | \n",
+ " cO | \n",
+ " CCl | \n",
+ " CC(-,:C)=O | \n",
+ " cBr | \n",
+ " c[N&+](=O)[O&-] | \n",
+ " cC(-,:C)=O | \n",
+ " C=CC | \n",
+ " cN | \n",
+ " cC(=O)OC | \n",
+ " COC | \n",
+ " CF | \n",
+ " CBr | \n",
+ " coc | \n",
+ " cF | \n",
+ " CC(=O)O | \n",
+ " c=O | \n",
+ " c[n&H1]c | \n",
+ " csc | \n",
+ " cC=O | \n",
+ " CNC | \n",
+ " CN | \n",
+ " CN(-,:C)C | \n",
+ " cC#N | \n",
+ " cn(-,:c)C | \n",
+ " cC(=O)O | \n",
+ " CC=C(-,:C)C | \n",
+ " CC#N | \n",
+ " cNC(-,:C)=O | \n",
+ " cNC | \n",
+ " C/C=C/C | \n",
+ " CC=CC | \n",
+ " C=C(-,:C)C | \n",
+ " C#CC | \n",
+ " cC(-,:c)=O | \n",
+ " cN(-,:C)C | \n",
+ " CC(=O)OC | \n",
+ " CC#CC | \n",
+ " cI | \n",
+ " CNC(-,:C)=O | \n",
+ " cC=Cc | \n",
+ " c-n(-,:c)c | \n",
+ " cnn(-,:c)C | \n",
+ " cnnc | \n",
+ " cP(-,:c)c | \n",
+ " CS | \n",
+ " spectrum | \n",
+ " (W)-CO | \n",
+ " (W)-cOC | \n",
+ " (W)-COC(-,:C)=O | \n",
+ " (W)-cnc | \n",
+ " (W)-cCl | \n",
+ " (W)-cO | \n",
+ " (W)-CCl | \n",
+ " (W)-CC(-,:C)=O | \n",
+ " (W)-cBr | \n",
+ " (W)-c[N&+](=O)[O&-] | \n",
+ " (W)-cC(-,:C)=O | \n",
+ " (W)-C=CC | \n",
+ " (W)-cN | \n",
+ " (W)-cC(=O)OC | \n",
+ " (W)-COC | \n",
+ " (W)-CF | \n",
+ " (W)-CBr | \n",
+ " (W)-coc | \n",
+ " (W)-cF | \n",
+ " (W)-CC(=O)O | \n",
+ " (W)-c=O | \n",
+ " (W)-c[n&H1]c | \n",
+ " (W)-csc | \n",
+ " (W)-cC=O | \n",
+ " (W)-CNC | \n",
+ " (W)-CN | \n",
+ " (W)-CN(-,:C)C | \n",
+ " (W)-cC#N | \n",
+ " (W)-cn(-,:c)C | \n",
+ " (W)-cC(=O)O | \n",
+ " (W)-CC=C(-,:C)C | \n",
+ " (W)-CC#N | \n",
+ " (W)-cNC(-,:C)=O | \n",
+ " (W)-cNC | \n",
+ " (W)-C/C=C/C | \n",
+ " (W)-CC=CC | \n",
+ " (W)-C=C(-,:C)C | \n",
+ " (W)-C#CC | \n",
+ " (W)-cC(-,:c)=O | \n",
+ " (W)-cN(-,:C)C | \n",
+ " (W)-CC(=O)OC | \n",
+ " (W)-CC#CC | \n",
+ " (W)-cI | \n",
+ " (W)-CNC(-,:C)=O | \n",
+ " (W)-cC=Cc | \n",
+ " (W)-c-n(-,:c)c | \n",
+ " (W)-cnn(-,:c)C | \n",
+ " (W)-cnnc | \n",
+ " (W)-cP(-,:c)c | \n",
+ " (W)-CS | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6288 | \n",
+ " a5e81bc8-7cd1-4603-bf77-d6635351f25b | \n",
+ " CC1=CC[C@@H](C2(C)CC2(C)C(=O)OC(C)(C)C)CC1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " [0.9202703349311558, 0.9234971350037704, 0.926... | \n",
+ " 0.567877 | \n",
+ " 0.541339 | \n",
+ " 7.301515 | \n",
+ " 0.532957 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 0.516949 | \n",
+ " 0.519961 | \n",
+ " 0.518953 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 0.508548 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 40.776154 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ " 6289 | \n",
+ " 427ee4cc-e725-4b40-829d-51b14984e029 | \n",
+ " Brc1cnc2ccccc2n1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " [0.6751178997222926, 0.6719449948267988, 0.668... | \n",
+ " 0.567877 | \n",
+ " 0.541339 | \n",
+ " 0.540732 | \n",
+ " 8.894128 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 16.775000 | \n",
+ " 0.519961 | \n",
+ " 0.518953 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 0.508548 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 0.506836 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ " 6290 | \n",
+ " dcf80c78-98e2-4ef9-b8a8-bc259aa8f9a4 | \n",
+ " CCCC[Sn](CCCC)(CCCC)c1cnc2ccccc2n1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " [0.5719682556458464, 0.5722615937762792, 0.571... | \n",
+ " 0.567877 | \n",
+ " 0.541339 | \n",
+ " 0.540732 | \n",
+ " 8.894128 | \n",
+ " 0.53604 | \n",
+ " 0.528167 | \n",
+ " 0.526322 | \n",
+ " 0.521537 | \n",
+ " 0.516949 | \n",
+ " 0.519961 | \n",
+ " 0.518953 | \n",
+ " 0.519177 | \n",
+ " 0.518284 | \n",
+ " 0.516174 | \n",
+ " 0.516174 | \n",
+ " 0.51474 | \n",
+ " 0.515511 | \n",
+ " 0.512442 | \n",
+ " 0.513315 | \n",
+ " 0.511245 | \n",
+ " 0.510595 | \n",
+ " 0.509516 | \n",
+ " 0.508978 | \n",
+ " 0.508226 | \n",
+ " 0.508441 | \n",
+ " 0.508548 | \n",
+ " 0.508441 | \n",
+ " 0.505985 | \n",
+ " 0.507156 | \n",
+ " 0.50705 | \n",
+ " 0.506836 | \n",
+ " 0.50673 | \n",
+ " 0.505772 | \n",
+ " 0.505879 | \n",
+ " 0.505348 | \n",
+ " 0.505985 | \n",
+ " 0.505666 | \n",
+ " 0.505666 | \n",
+ " 0.505348 | \n",
+ " 0.50556 | \n",
+ " 0.505242 | \n",
+ " 0.505136 | \n",
+ " 0.504713 | \n",
+ " 0.504185 | \n",
+ " 0.504079 | \n",
+ " 0.501979 | \n",
+ " 0.503132 | \n",
+ " 0.502398 | \n",
+ " 0.502922 | \n",
+ " 0.502398 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " fn \\\n",
+ "6288 a5e81bc8-7cd1-4603-bf77-d6635351f25b \n",
+ "6289 427ee4cc-e725-4b40-829d-51b14984e029 \n",
+ "6290 dcf80c78-98e2-4ef9-b8a8-bc259aa8f9a4 \n",
+ "\n",
+ " cano_smi CO cOC COC(-,:C)=O cnc \\\n",
+ "6288 CC1=CC[C@@H](C2(C)CC2(C)C(=O)OC(C)(C)C)CC1 0 0 1 0 \n",
+ "6289 Brc1cnc2ccccc2n1 0 0 0 1 \n",
+ "6290 CCCC[Sn](CCCC)(CCCC)c1cnc2ccccc2n1 0 0 0 1 \n",
+ "\n",
+ " cCl cO CCl CC(-,:C)=O cBr c[N&+](=O)[O&-] cC(-,:C)=O C=CC cN \\\n",
+ "6288 0 0 0 0 0 0 0 0 0 \n",
+ "6289 0 0 0 0 1 0 0 0 0 \n",
+ "6290 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " cC(=O)OC COC CF CBr coc cF CC(=O)O c=O c[n&H1]c csc cC=O CNC \\\n",
+ "6288 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "6289 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "6290 0 0 0 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " CN CN(-,:C)C cC#N cn(-,:c)C cC(=O)O CC=C(-,:C)C CC#N cNC(-,:C)=O \\\n",
+ "6288 0 0 0 0 0 1 0 0 \n",
+ "6289 0 0 0 0 0 0 0 0 \n",
+ "6290 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " cNC C/C=C/C CC=CC C=C(-,:C)C C#CC cC(-,:c)=O cN(-,:C)C CC(=O)OC \\\n",
+ "6288 0 0 0 0 0 0 0 0 \n",
+ "6289 0 0 0 0 0 0 0 0 \n",
+ "6290 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " CC#CC cI CNC(-,:C)=O cC=Cc c-n(-,:c)c cnn(-,:c)C cnnc cP(-,:c)c \\\n",
+ "6288 0 0 0 0 0 0 0 0 \n",
+ "6289 0 0 0 0 0 0 0 0 \n",
+ "6290 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " CS spectrum (W)-CO \\\n",
+ "6288 0 [0.9202703349311558, 0.9234971350037704, 0.926... 0.567877 \n",
+ "6289 0 [0.6751178997222926, 0.6719449948267988, 0.668... 0.567877 \n",
+ "6290 0 [0.5719682556458464, 0.5722615937762792, 0.571... 0.567877 \n",
+ "\n",
+ " (W)-cOC (W)-COC(-,:C)=O (W)-cnc (W)-cCl (W)-cO (W)-CCl \\\n",
+ "6288 0.541339 7.301515 0.532957 0.53604 0.528167 0.526322 \n",
+ "6289 0.541339 0.540732 8.894128 0.53604 0.528167 0.526322 \n",
+ "6290 0.541339 0.540732 8.894128 0.53604 0.528167 0.526322 \n",
+ "\n",
+ " (W)-CC(-,:C)=O (W)-cBr (W)-c[N&+](=O)[O&-] (W)-cC(-,:C)=O \\\n",
+ "6288 0.521537 0.516949 0.519961 0.518953 \n",
+ "6289 0.521537 16.775000 0.519961 0.518953 \n",
+ "6290 0.521537 0.516949 0.519961 0.518953 \n",
+ "\n",
+ " (W)-C=CC (W)-cN (W)-cC(=O)OC (W)-COC (W)-CF (W)-CBr (W)-coc \\\n",
+ "6288 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "6289 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "6290 0.519177 0.518284 0.516174 0.516174 0.51474 0.515511 0.512442 \n",
+ "\n",
+ " (W)-cF (W)-CC(=O)O (W)-c=O (W)-c[n&H1]c (W)-csc (W)-cC=O \\\n",
+ "6288 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "6289 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "6290 0.513315 0.511245 0.510595 0.509516 0.508978 0.508226 \n",
+ "\n",
+ " (W)-CNC (W)-CN (W)-CN(-,:C)C (W)-cC#N (W)-cn(-,:c)C (W)-cC(=O)O \\\n",
+ "6288 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n",
+ "6289 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n",
+ "6290 0.508441 0.508548 0.508441 0.505985 0.507156 0.50705 \n",
+ "\n",
+ " (W)-CC=C(-,:C)C (W)-CC#N (W)-cNC(-,:C)=O (W)-cNC (W)-C/C=C/C \\\n",
+ "6288 40.776154 0.50673 0.505772 0.505879 0.505348 \n",
+ "6289 0.506836 0.50673 0.505772 0.505879 0.505348 \n",
+ "6290 0.506836 0.50673 0.505772 0.505879 0.505348 \n",
+ "\n",
+ " (W)-CC=CC (W)-C=C(-,:C)C (W)-C#CC (W)-cC(-,:c)=O (W)-cN(-,:C)C \\\n",
+ "6288 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "6289 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "6290 0.505985 0.505666 0.505666 0.505348 0.50556 \n",
+ "\n",
+ " (W)-CC(=O)OC (W)-CC#CC (W)-cI (W)-CNC(-,:C)=O (W)-cC=Cc \\\n",
+ "6288 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "6289 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "6290 0.505242 0.505136 0.504713 0.504185 0.504079 \n",
+ "\n",
+ " (W)-c-n(-,:c)c (W)-cnn(-,:c)C (W)-cnnc (W)-cP(-,:c)c (W)-CS \n",
+ "6288 0.501979 0.503132 0.502398 0.502922 0.502398 \n",
+ "6289 0.501979 0.503132 0.502398 0.502922 0.502398 \n",
+ "6290 0.501979 0.503132 0.502398 0.502922 0.502398 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "loaded_train_df.tail(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.plot(loaded_test_df.iloc[-1:]['spectrum'].values[0][::-1])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Dataset composition"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### These datasets contain NIST `basic information` & KIT complat in-house `spectra`.\n",
+ "\n",
+ "Since we are not in the position to distribute NIST spectra, saved dataframes in this repository only have: \n",
+ "\n",
+ "1. NIST filenames and canonical SMILES without spectra.\n",
+ "2. KIT complat in-house filenames and canonical SMILES with spectra in the `spectrum` column.\n",
+ "\n",
+ "- Therefore, NIST and in-house entries can be told apart (and counted) by checking the `spectrum` column: it is empty for NIST rows and filled for in-house rows.\n",
+ "- Once you have obtained the NIST spectra, they will be loaded in the next notebook.\n",
+ "- To purchase NIST spectra : __TBD__."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training dataset count 4819 / NIST 3930 / complat 889\n",
+ "Validation dataset count 733 / NIST 557 / complat 176\n",
+ "Testing dataset count 739 / NIST 579 / complat 160\n"
+ ]
+ }
+ ],
+ "source": [
+ "for split_name, split_df in (('Training', loaded_train_df), ('Validation', loaded_valid_df), ('Testing', loaded_test_df)):\n",
+ "    n_nist = split_df['spectrum'].isnull().sum()  # rows without a stored spectrum are counted as NIST entries\n",
+ "    print('{} dataset count {} / NIST {} / complat {}'.format(split_name, split_df.shape[0], n_nist, split_df.shape[0] - n_nist))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3. Functional groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clear_mapnum(mol):\n",
+ "    \"\"\"Clear the 'molAtomMapNumber' property on every atom of `mol` that has it.\"\"\"\n",
+ "    for atom in mol.GetAtoms():\n",
+ "        if atom.HasProp('molAtomMapNumber'):\n",
+ "            atom.ClearProp('molAtomMapNumber')\n",
+ "\n",
+ "\n",
+ "def extract_fgs(mol):\n",
+ "    \"\"\"Return the distinct functional-group SMARTS found in `mol`, in arbitrary order.\n",
+ "\n",
+ "    Each group reported by `identify_functional_groups` is re-parsed from its\n",
+ "    `type` string, stripped of atom-map numbers, and written back as SMARTS.\n",
+ "    \"\"\"\n",
+ "    fg_smas = set()\n",
+ "    for fg in identify_functional_groups(mol):\n",
+ "        fg_mol = Chem.MolFromSmarts(fg.type)  # separate name: do not shadow the `mol` argument\n",
+ "        clear_mapnum(fg_mol)\n",
+ "        fg_smas.add(Chem.MolToSmarts(fg_mol))\n",
+ "    return list(fg_smas)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def count_fg_occurrences(*smiles_columns):\n",
+ "    \"\"\"Count, per functional-group SMARTS, how many molecules contain it.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    *smiles_columns : iterables of SMILES strings\n",
+ "        Every column is scanned in the order given.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    collections.Counter\n",
+ "        Maps each SMARTS returned by `extract_fgs` to the number of molecules\n",
+ "        it was found in (a molecule counts once per group).\n",
+ "    \"\"\"\n",
+ "    counts = Counter()\n",
+ "    for smiles_column in smiles_columns:\n",
+ "        for csmi in smiles_column:\n",
+ "            counts.update(extract_fgs(Chem.MolFromSmiles(csmi)))\n",
+ "    return counts\n",
+ "\n",
+ "\n",
+ "fg_dict = count_fg_occurrences(\n",
+ "    loaded_train_df['cano_smi'],\n",
+ "    loaded_valid_df['cano_smi'],\n",
+ "    loaded_test_df['cano_smi'],\n",
+ ")\n",
+ "\n",
+ "# Most frequent groups first; ties keep first-seen order because sorted() is stable.\n",
+ "fg_count_from_mol = dict(sorted(fg_dict.items(), key=lambda item: item[1], reverse=True))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The 50 label columns sit between 'cano_smi' and 'spectrum' (positions 2..51).\n",
+ "col_nms = loaded_train_df.columns[2:52]\n",
+ "all_split_dfs = (loaded_train_df, loaded_valid_df, loaded_test_df)\n",
+ "# Occurrences of each functional group, summed over the three splits.\n",
+ "fg_count_from_df = {cn: sum(df[cn].sum() for df in all_split_dfs) for cn in col_nms}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "functional group SMARTS / count in the dataframe\n",
+ "CO / 765\n",
+ "cOC / 523\n",
+ "COC(-,:C)=O / 502\n",
+ "cnc / 440\n",
+ "cCl / 436\n",
+ "cO / 349\n",
+ "CCl / 318\n",
+ "CC(-,:C)=O / 263\n",
+ "cBr / 230\n",
+ "c[N&+](=O)[O&-] / 245\n",
+ "cC(-,:C)=O / 235\n",
+ "C=CC / 237\n",
+ "cN / 224\n",
+ "cC(=O)OC / 210\n",
+ "COC / 205\n",
+ "CF / 198\n",
+ "CBr / 195\n",
+ "coc / 169\n",
+ "cF / 171\n",
+ "CC(=O)O / 145\n",
+ "c=O / 135\n",
+ "c[n&H1]c / 124\n",
+ "csc / 113\n",
+ "cC=O / 105\n",
+ "CNC / 110\n",
+ "CN / 105\n",
+ "CN(-,:C)C / 105\n",
+ "cC#N / 82\n",
+ "cn(-,:c)C / 93\n",
+ "cC(=O)O / 89\n",
+ "CC=C(-,:C)C / 86\n",
+ "CC#N / 86\n",
+ "cNC(-,:C)=O / 79\n",
+ "cNC / 76\n",
+ "C/C=C/C / 66\n",
+ "CC=CC / 77\n",
+ "C=C(-,:C)C / 71\n",
+ "C#CC / 71\n",
+ "cC(-,:c)=O / 70\n",
+ "cN(-,:C)C / 71\n",
+ "CC(=O)OC / 69\n",
+ "CC#CC / 67\n",
+ "cI / 62\n",
+ "CNC(-,:C)=O / 56\n",
+ "cC=Cc / 54\n",
+ "c-n(-,:c)c / 37\n",
+ "cnn(-,:c)C / 46\n",
+ "cnnc / 32\n",
+ "cP(-,:c)c / 44\n",
+ "CS / 39\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('functional group SMARTS / count in the dataframe')\n",
+ "for cn in col_nms:\n",
+ "    print('{} / {}'.format(cn, fg_count_from_df[cn]))\n",
+ "    # .get(): a group never found by the IFG extraction is a mismatch to report, not a KeyError.\n",
+ "    if fg_count_from_df[cn] != fg_count_from_mol.get(cn, 0):\n",
+ "        print('- - - - - {} has wrong count between dataframe and IFG extraction.'.format(cn))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "50"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(col_nms)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "chem-dl-ir",
+ "language": "python",
+ "name": "chem-dl-ir"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/training_spectrum_to_fgs/INSTALL.md b/training_spectrum_to_fgs/INSTALL.md
index b040cdd..e433e88 100644
--- a/training_spectrum_to_fgs/INSTALL.md
+++ b/training_spectrum_to_fgs/INSTALL.md
@@ -45,22 +45,19 @@ $ pip install numpy pandas matplotlib scikit-learn scipy Flask git+https://githu
$ python -m ipykernel install --user --name=deep-ir-01
```
-
# 2. Training and Validation
1. To get nist data, you should buy it from `https://www.nist.gov/srd/nist-standard-reference-database-35`, and put & rename the folder to `./data/nist/`.

-
2. run following files one-by-one
-
```
+00_explanation_nist_and_in_house_dataset.ipynb
01_load_data.ipynb
02_train_model.ipynb
03_verify_model_and_save_acc.ipynb
04_count_spectra.ipynb
```
-