|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "id": "43d1d18f-6770-4d51-acdc-b6abeefb20e1", |
| 6 | + "metadata": {}, |
| 7 | + "source": [ |
| 8 | + "## get list of CoRE MOF - CSD - unmodified" |
| 9 | + ] |
| 10 | + }, |
| 11 | + { |
| 12 | + "cell_type": "code", |
| 13 | + "execution_count": 14, |
| 14 | + "id": "f2ffddd5-373d-4569-ad30-6227c7fe746e", |
| 15 | + "metadata": {}, |
| 16 | + "outputs": [], |
| 17 | + "source": [ |
| 18 | + "import json" |
| 19 | + ] |
| 20 | + }, |
| 21 | + { |
| 22 | + "cell_type": "code", |
| 23 | + "execution_count": 16, |
| 24 | + "id": "0c5aff3d-56fc-47b6-8644-61dbfb9c67c5", |
| 25 | + "metadata": {}, |
| 26 | + "outputs": [], |
| 27 | + "source": [ |
| 28 | + "with open(\"./list_coremof_csd_unmodified_20250227.json\", \"r\") as f:\n", |
| 29 | + " csd_unmodified = json.load(f)" |
| 30 | + ] |
| 31 | + }, |
| 32 | + { |
| 33 | + "cell_type": "markdown", |
| 34 | + "id": "8c3447f3-9938-48d4-a8b8-939ac3ce7f40", |
| 35 | + "metadata": {}, |
| 36 | + "source": [ |
| 37 | + "### there are two subsets (CR & NCR)\n", |
| 38 | + "\n", |
| 39 | + "```python\n", |
| 40 | + " CSD Unmodified Dataset # (N = 12,261)\n", |
| 41 | + " │\n", |
| 42 | + " ├── CR # computation-ready (N = 4,703)\n", |
| 43 | + " │ │\n", |
| 44 | + " │ ├── ASR # all solvent removed (N = 1,894)\n", |
| 45 | + " │ ├── FSR # free solvent removed (N = 2,657)\n", |
| 46 | + " │ └── ION # with ion (N = 152)\n", |
| 47 | + " │\n", |
| 48 | + " └── NCR # not computation-ready (N = 7,558)\n" |
| 49 | + ] |
| 50 | + }, |
| 51 | + { |
| 52 | + "cell_type": "code", |
| 53 | + "execution_count": 36, |
| 54 | + "id": "55deb24d-8d13-441f-acd6-bca911ee75ae", |
| 55 | + "metadata": {}, |
| 56 | + "outputs": [ |
| 57 | + { |
| 58 | + "data": { |
| 59 | + "text/plain": [ |
| 60 | + "dict_keys(['CR', 'NCR'])" |
| 61 | + ] |
| 62 | + }, |
| 63 | + "execution_count": 36, |
| 64 | + "metadata": {}, |
| 65 | + "output_type": "execute_result" |
| 66 | + } |
| 67 | + ], |
| 68 | + "source": [ |
| 69 | + "csd_unmodified.keys()" |
| 70 | + ] |
| 71 | + }, |
| 72 | + { |
| 73 | + "cell_type": "code", |
| 74 | + "execution_count": 38, |
| 75 | + "id": "0936cfd5-bb33-4e14-9de4-30282bd7d40a", |
| 76 | + "metadata": {}, |
| 77 | + "outputs": [ |
| 78 | + { |
| 79 | + "data": { |
| 80 | + "text/plain": [ |
| 81 | + "dict_keys(['ASR', 'FSR', 'ION'])" |
| 82 | + ] |
| 83 | + }, |
| 84 | + "execution_count": 38, |
| 85 | + "metadata": {}, |
| 86 | + "output_type": "execute_result" |
| 87 | + } |
| 88 | + ], |
| 89 | + "source": [ |
| 90 | + "csd_unmodified[\"CR\"].keys()" |
| 91 | + ] |
| 92 | + }, |
| 93 | + { |
| 94 | + "cell_type": "markdown", |
| 95 | + "id": "1333bfb5-811e-480b-96e2-1b822449a2a6", |
| 96 | + "metadata": {}, |
| 97 | + "source": [ |
| 98 | + "**Note that the refcode in the CoRE MOF DB has an additional string ending such as “_ASR_pacman”.**" |
| 99 | + ] |
| 100 | + }, |
| 101 | + { |
| 102 | + "cell_type": "markdown", |
| 103 | + "id": "4a9fc947-3205-46d4-83a2-5fed8176d900", |
| 104 | + "metadata": {}, |
| 105 | + "source": [ |
| 106 | + "**For the CR subset, each entry is a pair: the REFCODE followed by the CoRE MOF ID.**\n", |
| 107 | + "**For the NCR subset, each entry is only a REFCODE.**" |
| 108 | + ] |
| 109 | + }, |
| 110 | + { |
| 111 | + "cell_type": "code", |
| 112 | + "execution_count": 60, |
| 113 | + "id": "73643f82-05ca-4905-8699-1e717aa16f2c", |
| 114 | + "metadata": {}, |
| 115 | + "outputs": [ |
| 116 | + { |
| 117 | + "name": "stdout", |
| 118 | + "output_type": "stream", |
| 119 | + "text": [ |
| 120 | + "ABUXUT_ASR_pacman\n", |
| 121 | + "2016[Cu][nan]3[ASR]1\n" |
| 122 | + ] |
| 123 | + } |
| 124 | + ], |
| 125 | + "source": [ |
| 126 | + "# example for CR\n", |
| 127 | + "print(csd_unmodified[\"CR\"][\"ASR\"][1][0])\n", |
| 128 | + "print(csd_unmodified[\"CR\"][\"ASR\"][1][1])" |
| 129 | + ] |
| 130 | + }, |
| 131 | + { |
| 132 | + "cell_type": "code", |
| 133 | + "execution_count": 64, |
| 134 | + "id": "ec0e0fd1-e20f-4459-bcc0-3ddcca5cd9d8", |
| 135 | + "metadata": {}, |
| 136 | + "outputs": [ |
| 137 | + { |
| 138 | + "name": "stdout", |
| 139 | + "output_type": "stream", |
| 140 | + "text": [ |
| 141 | + "ABECIX_FSR_pacman\n" |
| 142 | + ] |
| 143 | + } |
| 144 | + ], |
| 145 | + "source": [ |
| 146 | + "# example for NCR\n", |
| 147 | + "print(csd_unmodified[\"NCR\"][0])" |
| 148 | + ] |
| 149 | + }, |
| 150 | + { |
| 151 | + "cell_type": "markdown", |
| 152 | + "id": "a847770f-4dac-4888-a469-f050e330d959", |
| 153 | + "metadata": {}, |
| 154 | + "source": [ |
| 155 | + "## download original CIFs from CSD\n", |
| 156 | + "You need to install the [*CSD Python API*](https://downloads.ccdc.cam.ac.uk/documentation/API/installation_notes.html) and activate the licence first." |
| 157 | + ] |
| 158 | + }, |
| 159 | + { |
| 160 | + "cell_type": "markdown", |
| 161 | + "id": "ae5f4c8c-2266-4638-b47c-aeb3ad293f30", |
| 162 | + "metadata": {}, |
| 163 | + "source": [ |
| 164 | + "Then install [CoREMOF_tools](https://coremof-tools.readthedocs.io/en/latest/index.html) with `pip install CoREMOF-tools`." |
| 165 | + ] |
| 166 | + }, |
| 167 | + { |
| 168 | + "cell_type": "code", |
| 169 | + "execution_count": 70, |
| 170 | + "id": "91208e59-7bf6-4812-89b6-13943ba35a99", |
| 171 | + "metadata": {}, |
| 172 | + "outputs": [], |
| 173 | + "source": [ |
| 174 | + "from CoREMOF.structure import download_from_CSD" |
| 175 | + ] |
| 176 | + }, |
| 177 | + { |
| 178 | + "cell_type": "code", |
| 179 | + "execution_count": 74, |
| 180 | + "id": "be948ea1-8a5e-4a72-be5e-7a8c0059043f", |
| 181 | + "metadata": {}, |
| 182 | + "outputs": [], |
| 183 | + "source": [ |
| 184 | + "### download ASR structure\n", |
| 185 | + "\n", |
| 186 | + "for refcodes in csd_unmodified[\"CR\"][\"ASR\"][:10]: # test for 10 structures\n", |
| 187 | + " refcode = refcodes[0].replace(\"_ASR_pacman\", \"\")\n", |
| 188 | + " download_from_CSD(refcode=refcode, output_folder=\"./structures/CR/ASR\")" |
| 189 | + ] |
| 190 | + }, |
| 191 | + { |
| 192 | + "cell_type": "code", |
| 193 | + "execution_count": 76, |
| 194 | + "id": "75e0c9d2-bdb2-424d-9158-29003a9fde16", |
| 195 | + "metadata": {}, |
| 196 | + "outputs": [], |
| 197 | + "source": [ |
| 198 | + "### download FSR structure\n", |
| 199 | + "\n", |
| 200 | + "for refcodes in csd_unmodified[\"CR\"][\"FSR\"][:10]: # test for 10 structures\n", |
| 201 | + " refcode = refcodes[0].replace(\"_FSR_pacman\", \"\")\n", |
| 202 | + " download_from_CSD(refcode=refcode, output_folder=\"./structures/CR/FSR\")" |
| 203 | + ] |
| 204 | + }, |
| 205 | + { |
| 206 | + "cell_type": "code", |
| 207 | + "execution_count": 78, |
| 208 | + "id": "a0399d5e-2a57-4ede-b95e-0ba263fd4ae6", |
| 209 | + "metadata": {}, |
| 210 | + "outputs": [], |
| 211 | + "source": [ |
| 212 | + "### download Ion structure\n", |
| 213 | + "\n", |
| 214 | + "for refcodes in csd_unmodified[\"CR\"][\"ION\"][:10]: # test for 10 structures\n", |
| 215 | + " refcode = refcodes[0].replace(\"_ion_pacman\", \"\")\n", |
| 216 | + " download_from_CSD(refcode=refcode, output_folder=\"./structures/CR/Ion\")" |
| 217 | + ] |
| 218 | + }, |
| 219 | + { |
| 220 | + "cell_type": "code", |
| 221 | + "execution_count": 86, |
| 222 | + "id": "691e839a-387b-44d9-88b2-2e27b44bee94", |
| 223 | + "metadata": {}, |
| 224 | + "outputs": [], |
| 225 | + "source": [ |
| 226 | + "### download NCR structure\n", |
| 227 | + "\n", |
| 228 | + "for refcode in csd_unmodified[\"NCR\"][:10]: # test for 10 structures\n", |
| 229 | + " refcode = refcode.split(\"_\")[0]\n", |
| 230 | + " download_from_CSD(refcode=refcode, output_folder=\"./structures/NCR/\")" |
| 231 | + ] |
| 232 | + }, |
| 233 | + { |
| 234 | + "cell_type": "markdown", |
| 235 | + "id": "61e48f31-6cc1-4388-9095-b88270351ec1", |
| 236 | + "metadata": {}, |
| 237 | + "source": [ |
| 238 | + "### process the structures" |
| 239 | + ] |
| 240 | + }, |
| 241 | + { |
| 242 | + "cell_type": "markdown", |
| 243 | + "id": "b450249b-421c-429d-9f81-8cffd0611c0e", |
| 244 | + "metadata": {}, |
| 245 | + "source": [ |
| 246 | + "Since solvent removal is not required, the only processing needed is converting each structure to its primitive cell and to P1 symmetry." |
| 247 | + ] |
| 248 | + }, |
| 249 | + { |
| 250 | + "cell_type": "code", |
| 251 | + "execution_count": 110, |
| 252 | + "id": "5a5af1da-b1a5-4ea7-ab76-9c6a074f4497", |
| 253 | + "metadata": {}, |
| 254 | + "outputs": [], |
| 255 | + "source": [ |
| 256 | + "from CoREMOF.structure import make_primitive_p1" |
| 257 | + ] |
| 258 | + }, |
| 259 | + { |
| 260 | + "cell_type": "code", |
| 261 | + "execution_count": 116, |
| 262 | + "id": "1abd657b-4771-49e4-9bef-9bd2f1cb97eb", |
| 263 | + "metadata": { |
| 264 | + "scrolled": true |
| 265 | + }, |
| 266 | + "outputs": [], |
| 267 | + "source": [ |
| 268 | + "structure_pri = make_primitive_p1(filename=\"./structures/CR/ASR/\"+csd_unmodified[\"CR\"][\"ASR\"][0][0].split(\"_\")[0]+\".cif\") # ABAVOP " |
| 269 | + ] |
| 270 | + }, |
| 271 | + { |
| 272 | + "cell_type": "markdown", |
| 273 | + "id": "d0120d89-9667-45ea-b71a-aa19d6ab90fc", |
| 274 | + "metadata": {}, |
| 275 | + "source": [ |
| 276 | + "Predict partial atomic charges with [PACMAN Charge](https://pubs.acs.org/doi/10.1021/acs.jctc.4c00434).\n", |
| 277 | + "Install it with `pip install PACMAN-charge`." |
| 278 | + ] |
| 279 | + }, |
| 280 | + { |
| 281 | + "cell_type": "code", |
| 282 | + "execution_count": 18, |
| 283 | + "id": "001ba16b-c921-41bf-acb9-da36bdf6f8c6", |
| 284 | + "metadata": {}, |
| 285 | + "outputs": [ |
| 286 | + { |
| 287 | + "name": "stdout", |
| 288 | + "output_type": "stream", |
| 289 | + "text": [ |
| 290 | + "CIF Name: ./structures/CR/ASR/ABAVOP.cif\n", |
| 291 | + "Charge Type: DDEC6\n", |
| 292 | + "Digits: 10\n", |
| 293 | + "Atom Type: True\n", |
| 294 | + "Neutral: True\n", |
| 295 | + "Keep Connect: False\n", |
| 296 | + "Compelete and save as ./structures/CR/ASR/ABAVOP_pacman.cif\n" |
| 297 | + ] |
| 298 | + } |
| 299 | + ], |
| 300 | + "source": [ |
| 301 | + "from PACMANCharge import pmcharge\n", |
| 302 | + "pmcharge.predict(cif_file=\"./structures/CR/ASR/\"+csd_unmodified[\"CR\"][\"ASR\"][0][0].split(\"_\")[0]+\".cif\", # ABAVOP \n", |
| 303 | + " charge_type=\"DDEC6\",\n", |
| 304 | + " digits=10,\n", |
| 305 | + " atom_type=True,neutral=True,\n", |
| 306 | + " keep_connect=False)" |
| 307 | + ] |
| 308 | + }, |
| 309 | + { |
| 310 | + "cell_type": "markdown", |
| 311 | + "id": "a7c0a0af-4549-405b-96df-7b4d56525b51", |
| 312 | + "metadata": {}, |
| 313 | + "source": [ |
| 314 | + "**If you prefer to name the file by its CoRE MOF ID, rename it from the REFCODE to the CoRE MOF ID.**" |
| 315 | + ] |
| 316 | + }, |
| 317 | + { |
| 318 | + "cell_type": "code", |
| 319 | + "execution_count": 22, |
| 320 | + "id": "4117005e-abde-4d54-bc4b-62a38c0758dd", |
| 321 | + "metadata": {}, |
| 322 | + "outputs": [], |
| 323 | + "source": [ |
| 324 | + "import os" |
| 325 | + ] |
| 326 | + }, |
| 327 | + { |
| 328 | + "cell_type": "code", |
| 329 | + "execution_count": 26, |
| 330 | + "id": "3069a5d1-4b34-45c4-9e78-ba8f5468a4ea", |
| 331 | + "metadata": {}, |
| 332 | + "outputs": [], |
| 333 | + "source": [ |
| 334 | + "os.rename(\"./structures/CR/ASR/\"+csd_unmodified[\"CR\"][\"ASR\"][0][0].split(\"_\")[0]+\"_pacman.cif\", \"./structures/CR/ASR/\" + csd_unmodified[\"CR\"][\"ASR\"][0][1]+\".cif\") \n", |
| 335 | + "# ABAVOP_pacman.cif -> 2004[Co][rtl]3[ASR]2.cif" |
| 336 | + ] |
| 337 | + }, |
| 338 | + { |
| 339 | + "cell_type": "code", |
| 340 | + "execution_count": null, |
| 341 | + "id": "e294f9dd-d61e-4078-be4e-57b3c34b0846", |
| 342 | + "metadata": {}, |
| 343 | + "outputs": [], |
| 344 | + "source": [] |
| 345 | + } |
| 346 | + ], |
| 347 | + "metadata": { |
| 348 | + "kernelspec": { |
| 349 | + "display_name": "Python [conda env:base] *", |
| 350 | + "language": "python", |
| 351 | + "name": "conda-base-py" |
| 352 | + }, |
| 353 | + "language_info": { |
| 354 | + "codemirror_mode": { |
| 355 | + "name": "ipython", |
| 356 | + "version": 3 |
| 357 | + }, |
| 358 | + "file_extension": ".py", |
| 359 | + "mimetype": "text/x-python", |
| 360 | + "name": "python", |
| 361 | + "nbconvert_exporter": "python", |
| 362 | + "pygments_lexer": "ipython3", |
| 363 | + "version": "3.11.5" |
| 364 | + } |
| 365 | + }, |
| 366 | + "nbformat": 4, |
| 367 | + "nbformat_minor": 5 |
| 368 | +} |
0 commit comments