diff --git a/data/kg/chembl33_preprocessed_filtered_bioactivity_dataset_w_fullprotnames_smiles/meta.yaml b/data/kg/chembl33_preprocessed_filtered_bioactivity_dataset_w_fullprotnames_smiles/meta.yaml index 450197858..5982a8326 100644 --- a/data/kg/chembl33_preprocessed_filtered_bioactivity_dataset_w_fullprotnames_smiles/meta.yaml +++ b/data/kg/chembl33_preprocessed_filtered_bioactivity_dataset_w_fullprotnames_smiles/meta.yaml @@ -77,10 +77,10 @@ templates: Protein{# name|!}: {protein_name#} {#Molecule |!}{SMILES__description}: {SMILES#} Constraint{#s|!}: The {#resulting|derived|calculated!} {standard_type#} {#value |!}should be in {standard_units#}. Even if you are {#uncertain|not sure!}, you must {#derive|estimate|come up with!} a {standard_type#} {#value |!}without using any {#other|additional!} words. - Result: {standard_value#} {standard_units#} + Result: {standard_value#} {standard_units#} - |- Task: Please {#create|generate!} a {#molecule |!}{SMILES__description} that has a {#bioaffinity|affinity!} to {#the protein |!}{protein_name#} with a {standard_type#} {#value |!}of {standard_value#} {standard_units#}. - Result: {SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: What is the {#the bioaffinity|the affinity!} of a {#molecule to a protein|protein to a molecule!}? @@ -98,4 +98,4 @@ templates: Constraint: The {#shown|listed!} {standard_type#} values {#below |!}are in {standard_units#}. Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%3-5%aA1} without using any other words. 
Options: {standard_value%} - Answer: {%multiple_choice_result} + Answer: {%multiple_choice_result} diff --git a/data/kg/compound_chebi/meta.yaml b/data/kg/compound_chebi/meta.yaml index a3784a4f0..224145fac 100644 --- a/data/kg/compound_chebi/meta.yaml +++ b/data/kg/compound_chebi/meta.yaml @@ -61,4 +61,4 @@ templates: Result: {SMILES#} - |- Task: Please {#create|generate!} {#a compound |a !}{SMILES__description} that {rel1_type#} {node2_name#}. - Result: {SMILES#} + Result: {SMILES#} diff --git a/data/tabular/MUV_466/meta.yaml b/data/tabular/MUV_466/meta.yaml index c3d47e042..7881b4287 100644 --- a/data/tabular/MUV_466/meta.yaml +++ b/data/tabular/MUV_466/meta.yaml @@ -31,3 +31,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-466#not &NULL}{MUV-466__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-466#not &NULL}{MUV-466__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-466__names__noun}? + Answer: {MUV-466#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-466__names__noun}. + Result: {MUV-466#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-466#not &NULL}{MUV-466__names__noun} and report its {SMILES__description}. 
+ Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_548/meta.yaml b/data/tabular/MUV_548/meta.yaml index 6092be5a3..d11385c88 100644 --- a/data/tabular/MUV_548/meta.yaml +++ b/data/tabular/MUV_548/meta.yaml @@ -33,3 +33,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-548#not &NULL}{MUV-548__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-548#not &NULL}{MUV-548__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-548__names__noun}? + Answer: {MUV-548#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-548__names__noun}. + Result: {MUV-548#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-548#not &NULL}{MUV-548__names__noun} and report its {SMILES__description}. + Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_600/meta.yaml b/data/tabular/MUV_600/meta.yaml index cebb65352..26c73c2a5 100644 --- a/data/tabular/MUV_600/meta.yaml +++ b/data/tabular/MUV_600/meta.yaml @@ -32,3 +32,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-600#not &NULL}{MUV-600__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-600#not &NULL}{MUV-600__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-600__names__noun}? 
+ Answer: {MUV-600#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-600__names__noun}. + Result: {MUV-600#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-600#not &NULL}{MUV-600__names__noun} and report its {SMILES__description}. + Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_644/meta.yaml b/data/tabular/MUV_644/meta.yaml index 43a60930d..ee39d5992 100644 --- a/data/tabular/MUV_644/meta.yaml +++ b/data/tabular/MUV_644/meta.yaml @@ -32,3 +32,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-644#not &NULL}{MUV-644__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-644#not &NULL}{MUV-644__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-644__names__noun}? + Answer: {MUV-644#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-644__names__noun}. + Result: {MUV-644#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-644#not &NULL}{MUV-644__names__noun} and report its {SMILES__description}. + Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_652/meta.yaml b/data/tabular/MUV_652/meta.yaml index da14f996c..02307adac 100644 --- a/data/tabular/MUV_652/meta.yaml +++ b/data/tabular/MUV_652/meta.yaml @@ -31,3 +31,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-652#not &NULL}{MUV-652__names__noun}. 
+ - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-652#not &NULL}{MUV-652__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-652__names__noun}? + Answer: {MUV-652#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-652__names__noun}. + Result: {MUV-652#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-652#not &NULL}{MUV-652__names__noun} and report its {SMILES__description}. + Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_689/meta.yaml b/data/tabular/MUV_689/meta.yaml index 780715e39..38170e3c3 100644 --- a/data/tabular/MUV_689/meta.yaml +++ b/data/tabular/MUV_689/meta.yaml @@ -31,3 +31,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-689#not &NULL}{MUV-689__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-689#not &NULL}{MUV-689__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-689__names__noun}? + Answer: {MUV-689#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-689__names__noun}. + Result: {MUV-689#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-689#not &NULL}{MUV-689__names__noun} and report its {SMILES__description}. 
+ Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_692/meta.yaml b/data/tabular/MUV_692/meta.yaml index 62400e402..998faecd7 100644 --- a/data/tabular/MUV_692/meta.yaml +++ b/data/tabular/MUV_692/meta.yaml @@ -32,3 +32,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-692#not &NULL}{MUV-692__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-692#not &NULL}{MUV-692__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-692__names__noun}? + Answer: {MUV-692#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-692__names__noun}. + Result: {MUV-692#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-692#not &NULL}{MUV-692__names__noun} and report its {SMILES__description}. + Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_712/meta.yaml b/data/tabular/MUV_712/meta.yaml index 977d3feb5..4e287d65b 100644 --- a/data/tabular/MUV_712/meta.yaml +++ b/data/tabular/MUV_712/meta.yaml @@ -32,3 +32,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-712#not &NULL}{MUV-712__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-712#not &NULL}{MUV-712__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-712__names__noun}? 
+ Answer: {MUV-712#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-712__names__noun}. + Result: {MUV-712#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-712#not &NULL}{MUV-712__names__noun} and report its {SMILES__description}. + Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_713/meta.yaml b/data/tabular/MUV_713/meta.yaml index a5623ea17..74c1275a2 100644 --- a/data/tabular/MUV_713/meta.yaml +++ b/data/tabular/MUV_713/meta.yaml @@ -32,3 +32,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-713#not &NULL}{MUV-713__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-713#not &NULL}{MUV-713__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-713__names__noun}? + Answer: {MUV-713#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-713__names__noun}. + Result: {MUV-713#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-713#not &NULL}{MUV-713__names__noun} and report its {SMILES__description}. + Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_733/meta.yaml b/data/tabular/MUV_733/meta.yaml index 7acc2925d..b143660ce 100644 --- a/data/tabular/MUV_733/meta.yaml +++ b/data/tabular/MUV_733/meta.yaml @@ -31,3 +31,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-733#not &NULL}{MUV-733__names__noun}. 
+ - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-733#not &NULL}{MUV-733__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-733__names__noun}? + Answer: {MUV-733#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-733__names__noun}. + Result: {MUV-733#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-733#not &NULL}{MUV-733__names__noun} and report its {SMILES__description}. + Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_737/meta.yaml b/data/tabular/MUV_737/meta.yaml index 69c56e3ed..2880b2a8e 100644 --- a/data/tabular/MUV_737/meta.yaml +++ b/data/tabular/MUV_737/meta.yaml @@ -32,3 +32,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-737#not &NULL}{MUV-737__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-737#not &NULL}{MUV-737__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-737__names__noun}? + Answer: {MUV-737#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-737__names__noun}. + Result: {MUV-737#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-737#not &NULL}{MUV-737__names__noun} and report its {SMILES__description}. 
+ Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_810/meta.yaml b/data/tabular/MUV_810/meta.yaml index 74c9f1907..0cc5dae58 100644 --- a/data/tabular/MUV_810/meta.yaml +++ b/data/tabular/MUV_810/meta.yaml @@ -32,3 +32,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-810#not &NULL}{MUV-810__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-810#not &NULL}{MUV-810__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-810__names__noun}? + Answer: {MUV-810#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-810__names__noun}. + Result: {MUV-810#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-810#not &NULL}{MUV-810__names__noun} and report its {SMILES__description}. + Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_832/meta.yaml b/data/tabular/MUV_832/meta.yaml index 16ad3978a..9e2606cd5 100644 --- a/data/tabular/MUV_832/meta.yaml +++ b/data/tabular/MUV_832/meta.yaml @@ -31,3 +31,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-832#not &NULL}{MUV-832__names__noun}. + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-832#not &NULL}{MUV-832__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-832__names__noun}? 
+ Answer: {MUV-832#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-832__names__noun}. + Result: {MUV-832#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-832#not &NULL}{MUV-832__names__noun} and report its {SMILES__description}. + Result: {SMILES#} \ No newline at end of file diff --git a/data/tabular/MUV_846/meta.yaml b/data/tabular/MUV_846/meta.yaml index 81f19bd92..9d15e33d8 100644 --- a/data/tabular/MUV_846/meta.yaml +++ b/data/tabular/MUV_846/meta.yaml @@ -33,17 +33,13 @@ templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-846#not &NULL}{MUV-846__names__noun}. - |- Question: Is the {SMILES__description} {SMILES#} {MUV-846__names__noun}? - - Answer:{MUV-846#no&yes} + Answer: {MUV-846#no&yes} - |- Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-846__names__noun}. - - Result:{MUV-846#no&yes} + Result: {MUV-846#no&yes} - |- Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-846#not &NULL}{MUV-846__names__noun} and report its {SMILES__description}. - - Result:{SMILES#} + Result: {SMILES#} - |- {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-846#not &NULL}{MUV-846__names__noun} and report its {SMILES__description}. - Result: {SMILES#} diff --git a/data/tabular/MUV_852/meta.yaml b/data/tabular/MUV_852/meta.yaml index a77ec1be0..172f1c18c 100644 --- a/data/tabular/MUV_852/meta.yaml +++ b/data/tabular/MUV_852/meta.yaml @@ -31,3 +31,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-852#not &NULL}{MUV-852__names__noun}. 
+ - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-852__names__noun}? + Answer: {MUV-852#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-852__names__noun}. + Result: {MUV-852#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-852#not &NULL}{MUV-852__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-852#not &NULL}{MUV-852__names__noun} and report its {SMILES__description}. + Result: {SMILES#} diff --git a/data/tabular/MUV_858/meta.yaml b/data/tabular/MUV_858/meta.yaml index 0690988e8..d90a12ca6 100644 --- a/data/tabular/MUV_858/meta.yaml +++ b/data/tabular/MUV_858/meta.yaml @@ -32,3 +32,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-858#not &NULL}{MUV-858__names__noun}. + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-858__names__noun}? + Answer: {MUV-858#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-858__names__noun}. + Result: {MUV-858#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-858#not &NULL}{MUV-858__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-858#not &NULL}{MUV-858__names__noun} and report its {SMILES__description}. 
+ Result: {SMILES#} diff --git a/data/tabular/MUV_859/meta.yaml b/data/tabular/MUV_859/meta.yaml index 2bbecfc47..edfe29847 100644 --- a/data/tabular/MUV_859/meta.yaml +++ b/data/tabular/MUV_859/meta.yaml @@ -32,3 +32,15 @@ bibtex: URL = {https://doi.org/10.1021/ci8002649}} templates: - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-859#not &NULL}{MUV-859__names__noun}. + - |- + Question: Is the {SMILES__description} {SMILES#} {MUV-859__names__noun}? + Answer: {MUV-859#no&yes} + - |- + Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-859__names__noun}. + Result: {MUV-859#no&yes} + - |- + Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-859#not &NULL}{MUV-859__names__noun} and report its {SMILES__description}. + Result: {SMILES#} + - |- + {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-859#not &NULL}{MUV-859__names__noun} and report its {SMILES__description}. 
+ Result: {SMILES#} diff --git a/data/tabular/RedDB/transform.py b/data/tabular/RedDB/transform.py index cdf145cf6..5cc798f02 100644 --- a/data/tabular/RedDB/transform.py +++ b/data/tabular/RedDB/transform.py @@ -280,8 +280,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/SIDER/transform.py b/data/tabular/SIDER/transform.py index 0f4494872..70a2d540f 100644 --- a/data/tabular/SIDER/transform.py +++ b/data/tabular/SIDER/transform.py @@ -138,8 +138,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/ames_mutagenicity/meta.yaml b/data/tabular/ames_mutagenicity/meta.yaml index 654103ef2..b68541a12 100644 --- a/data/tabular/ames_mutagenicity/meta.yaml +++ b/data/tabular/ames_mutagenicity/meta.yaml @@ -93,13 +93,13 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {mutagenic#not &NULL}be {mutagenic__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {mutagenic#not &NULL}{mutagenic__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {mutagenic__names__adjective}:{mutagenic#no&yes} + - Is the {SMILES__description} {SMILES#} {mutagenic__names__adjective}:{mutagenic#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {mutagenic__names__adjective}. 
{#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{mutagenic#False&True} + Result: {mutagenic#False&True} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {mutagenic__names__adjective}? @@ -113,7 +113,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {mutagenic%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {mutagenic#not &NULL}{mutagenic__names__adjective}? @@ -127,4 +128,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%mutagenic%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/ames_mutagenicity/transform.py b/data/tabular/ames_mutagenicity/transform.py index 1d0949254..669b628fd 100644 --- a/data/tabular/ames_mutagenicity/transform.py +++ b/data/tabular/ames_mutagenicity/transform.py @@ -53,165 +53,6 @@ def get_and_transform_data(): fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "ames_mutagenicity", # unique identifier, we will also use this for directory names - "description": """Mutagenicity means the ability of a drug to induce genetic alterations. -Drugs that can cause damage to the DNA can result in cell death or other severe -adverse effects. Nowadays, the most widely used assay for testing the mutagenicity -of compounds is the Ames experiment which was invented by a professor named -Ames. 
The Ames test is a short term bacterial reverse mutation assay detecting -a large number of compounds which can induce genetic damage and frameshift mutations. -The dataset is aggregated from four papers.""", - "targets": [ - { - "id": "mutagenic", # name of the column in a tabular dataset - "description": "whether it is mutagenic (1) or not mutagenic (0)", - "units": None, # units of the values in this column (leave empty if unitless) - "type": "boolean", - "names": [ # names for the property (to sample from for building the prompts) - {"noun": "mutagenicity"}, - {"noun": "Ames mutagenicity"}, - {"adjective": "mutagenic"}, - {"adjective": "Ames mutagenic"}, - {"verb": "has the ability to induce genetic alterations"}, - {"gerund": "having the potential to cause mutations"}, - {"gerund": "having the potential to induce genetic alterations"}, - ], - }, - ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.1021/ci300400a", - "description": "corresponding publication", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/tox/#ames-mutagenicity", - "description": "Data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Xu2012, -doi = {10.1021/ci300400a}, -url = {https://doi.org/10.1021/ci300400a}, -year = {2012}, -month = oct, -publisher = {American Chemical Society (ACS)}, -volume = {52}, -number = {11}, -pages = {2840--2847}, -author = {Congying Xu and Feixiong Cheng and Lei Chen and -Zheng Du and Weihua Li and Guixia Liu and Philip W. Lee and Yun Tang}, -title = {In silico Prediction of Chemical Ames Mutagenicity}, -journal = {Journal of Chemical Information and Modeling}""", - ], - "templates": [ - "The molecule with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} {mutagenic#no &NULL}{mutagenic__names__adjective} properties.", # noqa: E501 - "Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule has {mutagenic#no &NULL}{mutagenic__names__adjective} {#properties|characteristics|features!}.", # noqa: E501 - "The {SMILES__description} {SMILES#} represents a molecule that is {mutagenic#not &NULL}identified as {mutagenic__names__adjective}.", # noqa: E501 - "The {#molecule |!}{SMILES__description} {SMILES#} is {mutagenic#not &NULL}{mutagenic__names__adjective}.", # noqa: E501 not all variables need to be used - # Instruction tuning text templates - """Task: Please classify a molecule based on the description. -Description: A molecule that is {mutagenic__names__adjective}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. -Result: {mutagenic#False&True}""", # noqa: E501 - """Task: Please classify a molecule based on the description. 
-Description: A molecule that is {mutagenic__names__adjective}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Answer the question in a {#full|complete!} sentence. -Result: This molecule is {mutagenic#not &NULL}{mutagenic__names__adjective}.""", - """Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. -Description: A molecule that is {mutagenic#not &NULL}{mutagenic__names__adjective}. -Result: {SMILES#}""", # noqa: E501 - # Conversational text templates - """User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} is {mutagenic__names__adjective}? -Assistant: {mutagenic#No&Yes}, this molecule is {mutagenic#not &NULL}{mutagenic__names__adjective}.""", # noqa: E501 - """User: Is the molecule with the {SMILES__description} {SMILES#} {mutagenic__names__adjective}? -Assistant: {mutagenic#No&Yes}, it is {mutagenic#not &NULL}{mutagenic__names__adjective}.""", # noqa: E501 - """User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that is {mutagenic#not &NULL}{mutagenic__names__adjective}? -Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#}""", # noqa: E501 - """User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that is {mutagenic#not &NULL}{mutagenic__names__adjective}? -Assistant: This is a molecule that is {mutagenic#not &NULL}{mutagenic__names__adjective}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. -Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? -User: Yes, please. The molecule should {mutagenic#not &NULL}be {mutagenic__names__adjective}. 
-Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} is {mutagenic#not &NULL}{mutagenic__names__adjective}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. -Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? -User: Yes, the molecule should {mutagenic#not &NULL}be {mutagenic__names__adjective}. -Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {mutagenic#not &NULL}{mutagenic__names__adjective}: {SMILES#}""", # noqa: E501 - # Benchmarking text templates - "Is the {SMILES__description} {SMILES#} {mutagenic__names__adjective}:{mutagenic#no&yes}", # noqa: E501 for the benchmarking setup separates input and output - """Task: Please classify a molecule based on the description. -Description: A molecule that is {mutagenic__names__adjective}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. -Result:{mutagenic#False&True}""", # noqa: E501 - # noqa: E501 """Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. - # Description: A molecule that is {mutagenic__names__adjective}. - # Result:{SMILES#}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {mutagenic__names__adjective}? -Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. -Options: -{mutagenic%} -Answer: {%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {mutagenic__names__adjective}? 
-Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. -Options: -{mutagenic%} -Answer:{%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules are {mutagenic#not &NULL}{mutagenic__names__adjective}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. -Options: -{SMILES%mutagenic%} -Answer: {%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules are {mutagenic#not &NULL}{mutagenic__names__adjective}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. -Options: -{SMILES%mutagenic%} -Answer:{%multiple_choice_result}""", # noqa: E501 - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/bc5chem/meta.yaml b/data/tabular/bc5chem/meta.yaml index cd7b301ba..adde3a1ad 100644 --- a/data/tabular/bc5chem/meta.yaml +++ b/data/tabular/bc5chem/meta.yaml @@ -49,6 +49,22 @@ templates: {#Sentence|Description!}: {sentence#} Answer: {matched_words#} - |- - User: Does 
the following text contain mentions of {#chemicals|chemical compounds|chemical substances!}?{# Can you return matches?| Can you output matches?| Please return matches.!} + User: Does the following text contain mentions of {#chemicals|chemical compounds|chemical substances!}?{# Can you return matches?| Can you output matches?|Please return matches!} {#Text: |!}{sentence#} Assistant: {#I found|There is!} {matched_words#}. + - |- + Task: {#Extract|Identify!} the chemical mentions in the given {sentence__names__noun}. List the chemicals found. If there is no {#match|mention of a chemical|matching entity!}, return `no match`. + + {#Sentence: |Text: |!}{sentence#} + + Answer: {matched_words#} + - |- + Task: {#Extract|Identify!} the chemical substances from the {sentence__names__noun} below. If there is no {#match|mention of a chemical|matching entity!}, return `no match`. + + {#Sentence: |Text: |!}{sentence#} + + Answer: {matched_words#} + - |- + Task: {#Please identify|Identify!} the chemical compounds in the {sentence__names__noun} provided. If there is no {#match|mention of a chemical|matching entity!}, return `no match`. + {#Sentence: |Text: |!}{sentence#} + Answer: {matched_words#} diff --git a/data/tabular/bc5disease/meta.yaml b/data/tabular/bc5disease/meta.yaml index 9a3928dda..320a5b506 100644 --- a/data/tabular/bc5disease/meta.yaml +++ b/data/tabular/bc5disease/meta.yaml @@ -63,10 +63,8 @@ templates: {#Sentence: |Text: |!}{sentence#} - Answer:{matched_words#} + Answer: {matched_words#} - |- Task: {#Please identify|Identify!} the diseases in the {sentence__names__noun} provided. If there is no {#match|mention of a disease|matching entity!}, return `no match`. 
- {#Sentence: |Text: |!}{sentence#} - - Answer:{matched_words#} + Answer: {matched_words#} diff --git a/data/tabular/bicerano_dataset/meta.yaml b/data/tabular/bicerano_dataset/meta.yaml index cebf2919e..68cc9e37f 100644 --- a/data/tabular/bicerano_dataset/meta.yaml +++ b/data/tabular/bicerano_dataset/meta.yaml @@ -51,18 +51,18 @@ templates: - The polymer with the {compound_name__names__noun} of {compound_name#} has an {Tg_exp__names__noun} of {Tg_exp#} {Tg_exp__units}. - The polymer with the {compound_name__names__noun} of {compound_name#} has a {Tg_calc__names__noun} of {Tg_calc#} {Tg_calc__units}. - The polymer with the {compound_name__names__noun} of {compound_name#} has a {rho_300K_calc__names__noun} of {rho_300K_calc#} {rho_300K_calc__units}. - - What is the {Tg_exp__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? Answer:{Tg_exp#} {Tg_exp__units}. - - What is the {Tg_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? Answer:{Tg_calc#} {Tg_calc__units}. - - What is the {rho_300K_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? Answer:{rho_300K_calc#} {rho_300K_calc__units}. - - What is the {Tg_exp__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? Answer:{Tg_exp#} {Tg_exp__units}. - - What is the {Tg_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? Answer:{Tg_calc#} {Tg_calc__units}. - - What is the {rho_300K_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? Answer:{rho_300K_calc#} {rho_300K_calc__units}. + - What is the {Tg_exp__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? Answer:{Tg_exp#} {Tg_exp__units}. + - What is the {Tg_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? Answer:{Tg_calc#} {Tg_calc__units}. + - What is the {rho_300K_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? 
Answer:{rho_300K_calc#} {rho_300K_calc__units}. + - What is the {Tg_exp__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? Answer:{Tg_exp#} {Tg_exp__units}. + - What is the {Tg_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? Answer:{Tg_calc#} {Tg_calc__units}. + - What is the {rho_300K_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? Answer:{rho_300K_calc#} {rho_300K_calc__units}. - The polymer with the {PSMILES__description} {PSMILES#} has an {Tg_exp__names__noun} of {Tg_exp#} {Tg_exp__units} and a {Tg_calc__names__noun} of {Tg_calc#} {Tg_calc__units}. - The polymer with the {compound_name__names__noun} {compound_name#} has an {Tg_exp__names__noun} of {Tg_exp#} {Tg_exp__units} and a {Tg_calc__names__noun} of {Tg_calc#} {Tg_calc__units}. - - Compare the {Tg_exp__names__noun} and {Tg_calc__names__noun} for the polymer with the {PSMILES__description} {PSMILES#}. Answer:{Tg_exp#} {Tg_exp__units}, {Tg_calc#} {Tg_calc__units}. - - Compare the {Tg_exp__names__noun} and {Tg_calc__names__noun} for the polymer with the {compound_name__names__noun} {compound_name#}. Answer:{Tg_exp#} {Tg_exp__units}, {Tg_calc#} {Tg_calc__units}. - - What is the {rho_300K_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#} at 300K? Answer:{rho_300K_calc#} {rho_300K_calc__units}. - - What is the {rho_300K_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#} at 300K? Answer:{rho_300K_calc#} {rho_300K_calc__units}. + - Compare the {Tg_exp__names__noun} and {Tg_calc__names__noun} for the polymer with the {PSMILES__description} {PSMILES#}. Answer:{Tg_exp#} {Tg_exp__units}, {Tg_calc#} {Tg_calc__units}. + - Compare the {Tg_exp__names__noun} and {Tg_calc__names__noun} for the polymer with the {compound_name__names__noun} {compound_name#}. Answer:{Tg_exp#} {Tg_exp__units}, {Tg_calc#} {Tg_calc__units}. 
+ - What is the {rho_300K_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#} at 300K? Answer:{rho_300K_calc#} {rho_300K_calc__units}. + - What is the {rho_300K_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#} at 300K? Answer:{rho_300K_calc#} {rho_300K_calc__units}. - |- Task: Please answer the multiple choice question. @@ -73,7 +73,8 @@ templates: Options: {Tg_exp%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. @@ -85,7 +86,8 @@ templates: Options: {Tg_calc%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. @@ -96,7 +98,8 @@ templates: Options: {rho_300K_calc%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. @@ -107,7 +110,8 @@ templates: Options: {Tg_exp%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. @@ -119,4 +123,5 @@ templates: Options: {Tg_calc%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/bio_ner/meta.yaml b/data/tabular/bio_ner/meta.yaml index 8566a1816..761ddd376 100644 --- a/data/tabular/bio_ner/meta.yaml +++ b/data/tabular/bio_ner/meta.yaml @@ -1,36 +1,36 @@ name: bio_ner -description: NER task on bio-related text. +description: NER data. 
identifiers: - - id: Sentence - description: Sentence - type: Other +- id: Sentence + description: Sentence + type: Other targets: - - id: entity_1 - description: entity_1 - type: Other - units: entity_1 - names: - - noun: entity_1 - - id: json - description: json - type: Other - units: - names: - - noun: JSON output +- id: entity_1 + description: entity_1 + type: Other + units: entity_1 + names: + - noun: entity_1 +- id: json + description: json + type: Other + units: null + names: + - noun: JSON output benchmarks: - - name: bio_ner - link: https://github.com/ML4LitS/bio-datasets - split_column: split -license: unknown +- name: ??? + link: ??? + split_column: split +license: NEEDS TO BE DEFINED links: - - url: https://github.com/ML4LitS/bio-datasets - description: ??? +- url: https://github.com/ML4LitS/bio-datasets + description: ??? num_points: 123509 bibtex: - - ??? +- ??? templates: - - |- - Task: Please carry out the {#named entity recognition (NER)|named entity recognition|NER!} task for the the text below. - Text: {Sentence#}. - Constrain: Please, {#only |!}list the entities in the form NER entity, span start, span end, and type {#in separate lines |!}with a high probability of being in the text. - Result: {entity_1#} +- |- + Task: Please carry out the {#named entity recognition (NER)|named entity recognition|NER!} task for the the text below. + Text: {Sentence#}. + Constrain: Please, {#only |!}list the entities in the form NER entity, span start, span end, and type {#in separate lines |!}with a high probability of being in the text. + Result: {entity_1#} diff --git a/data/tabular/bioavailability_ma_et_al/meta.yaml b/data/tabular/bioavailability_ma_et_al/meta.yaml index e5d7f417a..d69078b78 100644 --- a/data/tabular/bioavailability_ma_et_al/meta.yaml +++ b/data/tabular/bioavailability_ma_et_al/meta.yaml @@ -4,128 +4,132 @@ description: |- active ingredient or active moiety is absorbed from a drug product and becomes available at the site of action. 
targets: - - id: bioavailable - description: whether it is bioavailable (1) or not (0) - units: - type: boolean - names: - - noun: oral bioavailability - - adjective: orally bioavailable - uris: - - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C70913 +- id: bioavailable + description: whether it is bioavailable (1) or not (0) + units: null + type: boolean + names: + - noun: oral bioavailability + - adjective: orally bioavailable + uris: + - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C70913 benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: SMILES - type: SMILES - description: SMILES - - id: compound_name - type: Other - names: - - noun: compound name - - noun: drug name - - noun: generic drug name - description: drug name +- id: SMILES + type: SMILES + description: SMILES +- id: compound_name + type: Other + names: + - noun: compound name + - noun: drug name + - noun: generic drug name + description: drug name license: CC BY 4.0 links: - - url: https://doi.org/10.1016/j.jpba.2008.03.023 - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/adme/#bioavailability-ma-et-al - description: data source +- url: https://doi.org/10.1016/j.jpba.2008.03.023 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/adme/#bioavailability-ma-et-al + description: data source num_points: 640 bibtex: - - |- - @article{Ma2008, - doi = {10.1016/j.jpba.2008.03.023}, - url = {https://doi.org/10.1016/j.jpba.2008.03.023}, - year = {2008}, - month = aug, - publisher = {Elsevier BV}, - volume = {47}, - number = {4-5}, - author = {Chang-Ying Ma and Sheng-Yong Yang and Hui Zhang - and Ming-Li Xiang and Qi Huang and Yu-Quan Wei}, - title = {Prediction models of human plasma protein binding rate and - oral bioavailability derived by using GA-CG-SVM method}, - journal = {Journal of Pharmaceutical and 
Biomedical Analysis} +- |- + @article{Ma2008, + doi = {10.1016/j.jpba.2008.03.023}, + url = {https://doi.org/10.1016/j.jpba.2008.03.023}, + year = {2008}, + month = aug, + publisher = {Elsevier BV}, + volume = {47}, + number = {4-5}, + author = {Chang-Ying Ma and Sheng-Yong Yang and Hui Zhang + and Ming-Li Xiang and Qi Huang and Yu-Quan Wei}, + title = {Prediction models of human plasma protein binding rate and + oral bioavailability derived by using GA-CG-SVM method}, + journal = {Journal of Pharmaceutical and Biomedical Analysis} templates: - - The molecule with the {SMILES__description} {#representation of |!}{SMILES#} has a {bioavailable#low&high} {bioavailable__names__noun}. - - Based on the {SMILES__description} {#representation of |!}{SMILES#}, the molecule has a {bioavailable#low&high} {bioavailable__names__noun}. - - The {SMILES__description} {SMILES#} {#represents|is from!} a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}. - - The {SMILES__description} {SMILES#} has a {bioavailable#low&high} {bioavailable__names__noun}. - - The molecule with the {SMILES__description} {SMILES#} has a {bioavailable#low&high} {bioavailable__names__noun}. - - |- - Task: Please classify a molecule based on the description. - Description: Predict if the molecule has a low or high {bioavailable__names__noun}? - {#Molecule |!}{SMILES__description}: {SMILES#} - Constraint: Even if you are {#uncertain|not sure!}, you must pick either "low" or "high" without using any {#other|additional!} words. - Result: {bioavailable#low&high} - - |- - Task: Please classify a molecule based on the description. - Description: Predict if the molecule has a low or high {bioavailable__names__noun}? - {#Molecule |!}{SMILES__description}: {SMILES#} - Constraint: Answer the question in a {#full|complete!} sentence. - Result: This molecule has a {bioavailable#low&high} {bioavailable__names__noun}. 
- - |- - Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. - Description: A molecule that has a {bioavailable#low&high} {bioavailable__names__noun}. - Result: {SMILES#} - - |- - User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} has a low or high {bioavailable__names__noun}? - Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, this molecule has a {bioavailable#low&high} {bioavailable__names__noun}. - - |- - User: Has the molecule with the {SMILES__description} {SMILES#} a low or high {bioavailable__names__noun}? - Assistant: It has a {bioavailable#low&high} {bioavailable__names__noun}. - - |- - User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}? - Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#} - - |- - User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}? - Assistant: {#Ok, this|This!} is a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}: {SMILES#} - - |- - User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. - Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? - User: Yes, please. The molecule should have a {bioavailable#low&high} {bioavailable__names__noun}. - Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} has a {bioavailable#low&high} {bioavailable__names__noun}: {SMILES#} - - |- - User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. - Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? 
- User: Yes, the molecule should have a {bioavailable#low&high} {bioavailable__names__noun}. - Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} has a {bioavailable#low&high} {bioavailable__names__noun}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {bioavailable__names__adjective}?{bioavailable#yes&no} - - |- - Task: Please classify a molecule based on the description. - Description: Predict if the molecule has a low or high {bioavailable__names__noun}? - {#Molecule |!}{SMILES__description}: {SMILES#} - Constraint: Even if you are {#uncertain|not sure!}, you must pick either "low" or "high" without using any {#other|additional!} words. - Result:{bioavailable#low&high} - - |- - Task: Please answer the multiple choice question. - Question: Has the molecule with the {SMILES__description} {#representation of |!}{SMILES#} a high {bioavailable__names__noun}? - Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. - Options: - {bioavailable%} - Answer: {%multiple_choice_result} - - |- - Task: Please answer the multiple choice question. - Question: Has the molecule with the {SMILES__description} {#representation of |!}{SMILES#} a high {bioavailable__names__noun}? - Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. - Options: - {bioavailable%} - Answer:{%multiple_choice_result} - - |- - Task: Please answer the multiple choice question. - Question: Which molecules have a high {bioavailable__names__noun}? - Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. - Options: - {SMILES%bioavailable%} - Answer: {%multiple_choice_result} - - |- - Task: Please answer the multiple choice question. - Question: Which molecules have a high {bioavailable__names__noun}? 
- Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. - Options: - {SMILES%bioavailable%} - Answer:{%multiple_choice_result} +- The molecule with the {SMILES__description} {#representation of |!}{SMILES#} has + a {bioavailable#low&high} {bioavailable__names__noun}. +- Based on the {SMILES__description} {#representation of |!}{SMILES#}, the molecule + has a {bioavailable#low&high} {bioavailable__names__noun}. +- The {SMILES__description} {SMILES#} represents a molecule that has a {bioavailable#low&high} + {bioavailable__names__noun}. +- The {SMILES__description} {SMILES#} has a {bioavailable#low&high} {bioavailable__names__noun}. +- The molecule with the {SMILES__description} {SMILES#} has a {bioavailable#low&high} + {bioavailable__names__noun}. +- |- + Task: Please classify a molecule based on the description. + Description: Predict if the molecule has a low or high {bioavailable__names__noun}? + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Even if you are {#uncertain|not sure!}, you must pick either "low" or "high" without using any {#other|additional!} words. + Result: {bioavailable#low&high} +- |- + Task: Please classify a molecule based on the description. + Description: Predict if the molecule has a low or high {bioavailable__names__noun}? + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Answer the question in a {#full|complete!} sentence. + Result: This molecule has a {bioavailable#low&high} {bioavailable__names__noun}. +- |- + Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. + Description: A molecule that has a {bioavailable#low&high} {bioavailable__names__noun}. + Result: {SMILES#} +- |- + User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} has a low or high {bioavailable__names__noun}? 
+ Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, this molecule has a {bioavailable#low&high} {bioavailable__names__noun}. +- |- + User: Has the molecule with the {SMILES__description} {SMILES#} a low or high {bioavailable__names__noun}? + Assistant: It has a {bioavailable#low&high} {bioavailable__names__noun}. +- |- + User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}? + Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#} +- |- + User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}? + Assistant: {#Ok, this|This!} is a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}: {SMILES#} +- |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? + User: Yes, please. The molecule should have a {bioavailable#low&high} {bioavailable__names__noun}. + Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} has a {bioavailable#low&high} {bioavailable__names__noun}: {SMILES#} +- |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? + User: Yes, the molecule should have a {bioavailable#low&high} {bioavailable__names__noun}. + Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} has a {bioavailable#low&high} {bioavailable__names__noun}: {SMILES#} +- Is the {SMILES__description} {SMILES#} {bioavailable__names__adjective}? {bioavailable#no&yes} +- |- + Task: Please classify a molecule based on the description. 
+ Description: Predict if the molecule has a low or high {bioavailable__names__noun}? + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Even if you are {#uncertain|not sure!}, you must pick either "low" or "high" without using any {#other|additional!} words. + Result: {bioavailable#low&high} +- |- + Task: Please answer the multiple choice question. + Question: Has the molecule with the {SMILES__description} {#representation of |!}{SMILES#} a high {bioavailable__names__noun}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. + Options: + {bioavailable%} + Answer: {%multiple_choice_result} +- |- + Task: Please answer the multiple choice question. + Question: Has the molecule with the {SMILES__description} {#representation of |!}{SMILES#} a high {bioavailable__names__noun}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. + Options: + {bioavailable%} + Answer: {%multiple_choice_result} +- |- + Task: Please answer the multiple choice question. + Question: Which molecules have a high {bioavailable__names__noun}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%bioavailable%} + Answer: {%multiple_choice_result} +- |- + Task: Please answer the multiple choice question. + Question: Which molecules have a high {bioavailable__names__noun}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
+ Options: + {SMILES%bioavailable%} + Answer: {%multiple_choice_result} diff --git a/data/tabular/bioavailability_ma_et_al/transform.py b/data/tabular/bioavailability_ma_et_al/transform.py index ac9db02bd..5ee1bf17f 100644 --- a/data/tabular/bioavailability_ma_et_al/transform.py +++ b/data/tabular/bioavailability_ma_et_al/transform.py @@ -44,172 +44,6 @@ def get_and_transform_data(): fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "bioavailability_ma_et_al", # unique identifier, we will also use this for directory names - "description": """Oral bioavailability is defined as the rate and extent to which the -active ingredient or active moiety is absorbed from a drug product and becomes -available at the site of action.""", - "targets": [ - { - "id": "bioavailable", # name of the column in a tabular dataset - "description": "whether it is bioavailable (1) or not (0)", # description of what this column means - "units": None, # units of the values in this column (leave empty if unitless) - "type": "boolean", - "names": [ # names for the property (to sample from for building the prompts) - {"noun": "oral bioavailability"}, - {"adjective": "orally bioavailable"}, - ], - "uris": [ - "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C70913", - ], - }, - ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", - "description": "SMILES", # description (optional, except for "Other") - }, - { - "id": "compound_name", # column name - "type": "Other", - "names": [ - {"noun": "compound name"}, - {"noun": "drug name"}, - {"noun": "generic drug name"}, - ], - "description": "drug name", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the 
original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) - { - "url": "https://doi.org/10.1016/j.jpba.2008.03.023", - "description": "corresponding publication", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/adme/#bioavailability-ma-et-al", - "description": "data source", - # note: this is not the original data, it is their modified version - # original larger dataset: http://modem.ucsd.edu/adme/databases/databases_bioavailability.htm - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Ma2008, -doi = {10.1016/j.jpba.2008.03.023}, -url = {https://doi.org/10.1016/j.jpba.2008.03.023}, -year = {2008}, -month = aug, -publisher = {Elsevier BV}, -volume = {47}, -number = {4-5}, -author = {Chang-Ying Ma and Sheng-Yong Yang and Hui Zhang -and Ming-Li Xiang and Qi Huang and Yu-Quan Wei}, -title = {Prediction models of human plasma protein binding rate and -oral bioavailability derived by using GA-CG-SVM method}, -journal = {Journal of Pharmaceutical and Biomedical Analysis}""", - ], - "templates": [ - "The molecule with the {SMILES__description} {#representation of |!}{SMILES#} has a {bioavailable#low&high} {bioavailable__names__noun}.", # noqa: E501 - "Based on the {SMILES__description} {#representation of |!}{SMILES#}, the molecule has a {bioavailable#low&high} {bioavailable__names__noun}.", # noqa: E501 - "The {SMILES__description} {SMILES#} represents a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}.", # noqa: E501 - "The {SMILES__description} {SMILES#} has a {bioavailable#low&high} {bioavailable__names__noun}.", - "The molecule with the {SMILES__description} {SMILES#} has a {bioavailable#low&high} {bioavailable__names__noun}.", # noqa: E501 - # Instruction tuning text templates - """Task: Please classify a molecule based on the description. -Description: Predict if the molecule has a low or high {bioavailable__names__noun}? 
-{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "low" or "high" without using any {#other|additional!} words. -Result: {bioavailable#low&high}""", # noqa: E501 - """Task: Please classify a molecule based on the description. -Description: Predict if the molecule has a low or high {bioavailable__names__noun}? -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Answer the question in a {#full|complete!} sentence. -Result: This molecule has a {bioavailable#low&high} {bioavailable__names__noun}.""", - """Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. -Description: A molecule that has a {bioavailable#low&high} {bioavailable__names__noun}. -Result: {SMILES#}""", # noqa: E501 - # Conversational text templates - """User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} has a low or high {bioavailable__names__noun}? -Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, this molecule has a {bioavailable#low&high} {bioavailable__names__noun}.""", # noqa: E501 - """User: Has the molecule with the {SMILES__description} {SMILES#} a low or high {bioavailable__names__noun}? -Assistant: It has a {bioavailable#low&high} {bioavailable__names__noun}.""", # noqa: E501 - """User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}? -Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#}""", # noqa: E501 - """User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}? -Assistant: {#Ok, this|This!} is a molecule that has a {bioavailable#low&high} {bioavailable__names__noun}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. 
-Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? -User: Yes, please. The molecule should have a {bioavailable#low&high} {bioavailable__names__noun}. -Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} has a {bioavailable#low&high} {bioavailable__names__noun}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. -Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? -User: Yes, the molecule should have a {bioavailable#low&high} {bioavailable__names__noun}. -Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} has a {bioavailable#low&high} {bioavailable__names__noun}: {SMILES#}""", # noqa: E501 - # Benchmarking text templates - "Is the {SMILES__description} {SMILES#} {bioavailable__names__adjective}?{bioavailable#no&yes}", # noqa: E501 for the benchmarking setup separates input and output - """Task: Please classify a molecule based on the description. -Description: Predict if the molecule has a low or high {bioavailable__names__noun}? -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "low" or "high" without using any {#other|additional!} words. -Result:{bioavailable#low&high}""", # noqa: E501 - # noqa: E501 """Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. - # Description: A molecule that has a {bioavailable#low&high} {bioavailable__names__noun}. - # Result:{SMILES#}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Has the molecule with the {SMILES__description} {#representation of |!}{SMILES#} a high {bioavailable__names__noun}? 
-Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. -Options: -{bioavailable%} -Answer: {%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Has the molecule with the {SMILES__description} {#representation of |!}{SMILES#} a high {bioavailable__names__noun}? -Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. -Options: -{bioavailable%} -Answer:{%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules have a high {bioavailable__names__noun}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. -Options: -{SMILES%bioavailable%} -Answer: {%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules have a high {bioavailable__names__noun}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
-Options: -{SMILES%bioavailable%} -Answer:{%multiple_choice_result}""", # noqa: E501 - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/block_polymers_morphology/meta.yaml b/data/tabular/block_polymers_morphology/meta.yaml index 4c3644bf2..845c667fd 100644 --- a/data/tabular/block_polymers_morphology/meta.yaml +++ b/data/tabular/block_polymers_morphology/meta.yaml @@ -64,3 +64,4 @@ templates: Assistant: {#Cool, |Awesome, |Great, |That sounds interesting, !}{#do you have any other constraints?|do you have other requirements?|what else should I take into account?!} User: The {Mn__names__noun} should be {Mn#} {Mn__units}, the {f1__names__noun} should be {f1#}. Assistant: I {#recommend|suggest|propose|advise!} the {#polymer|di-block copolymer|copolymer!} with BigSMILES {BigSMILES#}{Mw#}{D#}. + - The di-block copolymer with BigSMILES {BigSMILES#} {#exhibits|shows!} a {phase1#} phase at {T#} {T__units}, with a number-average molar mass (Mn) of {Mn#} {Mn__units} and a block volume fraction (f1) of {f1#}. Its additional polymer metrics include a mass-average molar mass (Mw) of {Mw#} and a dispersity (D) of {D#}. 
diff --git a/data/tabular/block_polymers_morphology/transform.py b/data/tabular/block_polymers_morphology/transform.py index de3faff86..324d6b966 100644 --- a/data/tabular/block_polymers_morphology/transform.py +++ b/data/tabular/block_polymers_morphology/transform.py @@ -1,36 +1,13 @@ import pandas as pd +from huggingface_hub import hf_hub_download columns_to_keep = ["phase1", "T", "BigSMILES", "Mn", "f1", "Mw", "D"] def process(): - df = pd.read_csv( - "https://raw.githubusercontent.com/olsenlabmit/BCDB/main/data/diblock.csv" - ) - df = df[df["phase2"].isna()] # remove multiple phases - mw_clean = [] - dispersity_clean = [] - - for mw, dispersity in zip(df["Mw"], df["D"]): - # if nan, make empty string - # else, add the units - if pd.isna(mw) or "nan" in str(mw): - mw_clean.append("REPLACENULL") - else: - mw_clean.append(f", average molecular mass of {mw:.1f} g/mol") - - if pd.isna(dispersity) or "nan" in str(dispersity): - # empty character that will still appear in the csv - dispersity_clean.append("REPLACENULL") - else: - dispersity_clean.append(f", and dispersity of {dispersity:.1f}") - - df["Mw"] = mw_clean - df["D"] = dispersity_clean - df.dropna(subset=columns_to_keep, inplace=True) - print(len(df)) - df[columns_to_keep].to_csv("data_clean.csv", index=False) - + df = hf_hub_download(repo_id="AdrianM0/block_polymers_morphology", filename="diblock.csv", repo_type="dataset") + df = pd.read_csv(df) + df.to_csv("data_clean.csv", index=False) if __name__ == "__main__": process() diff --git a/data/tabular/blood_brain_barrier_martins_et_al/meta.yaml b/data/tabular/blood_brain_barrier_martins_et_al/meta.yaml new file mode 100644 index 000000000..7205be6cd --- /dev/null +++ b/data/tabular/blood_brain_barrier_martins_et_al/meta.yaml @@ -0,0 +1,157 @@ +name: blood_brain_barrier_martins_et_al +description: |- + As a membrane separating circulating blood and brain extracellular + fluid, the blood-brain barrier (BBB) is the protection layer that blocks most + foreign 
drugs. Thus the ability of a drug to penetrate the barrier to deliver + to the site of action forms a crucial challenge in development of drugs for the + central nervous system. +targets: +- id: penetrate_BBB + description: The ability of a drug to penetrate the blood brain barrier (1) or not + (0) + units: null + type: boolean + names: + - noun: blood brain barrier penetration + - noun: ADME blood-brain barrier penetration + - verb: penetrates the blood brain barrier to reach the brain + - verb: penetrates the blood brain barrier + - adjective: penetrating the blood brain barrier + - adjective: penetrating the blood brain barrier to reach the brain + uris: null +benchmarks: +- name: TDC + link: https://tdcommons.ai/ + split_column: split +identifiers: +- id: SMILES + type: SMILES + description: SMILES +- id: compound_name + type: Other + names: + - noun: compound name + - noun: drug name + - noun: generic drug name + description: compound name +license: CC BY 4.0 +links: +- url: https://doi.org/10.1021/ci300124c + description: corresponding publication +- url: https://rb.gy/0xx91v + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/adme/#bbb-blood-brain-barrier-martins-et-al + description: data source +num_points: 2030 +bibtex: +- |- + @article{Martins2012, + doi = {10.1021/ci300124c}, + url = {https://doi.org/10.1021/ci300124c}, + year = {2012}, + month = jun, + publisher = {American Chemical Society (ACS)}, + volume = {52}, + number = {6}, + pages = {1686--1697}, + author = {Ines Filipa Martins and Ana L. Teixeira and Luis Pinheiro + and Andre O. 
Falcao}, + title = {A Bayesian Approach to in Silico Blood-Brain Barrier Penetration Modeling}, + journal = {Journal of Chemical Information and Modeling} +- |- + @article{Wu2018, + doi = {10.1039/c7sc02664a}, + url = {https://doi.org/10.1039/c7sc02664a}, + year = {2018}, + publisher = {Royal Society of Chemistry (RSC)}, + volume = {9}, + number = {2}, + pages = {513--530}, + author = {Zhenqin Wu and Bharath Ramsundar and Evan~N. Feinberg and Joseph + Gomes and Caleb Geniesse and Aneesh S. Pappu and Karl Leswing and Vijay Pande}, + title = {MoleculeNet: a benchmark for molecular machine learning}, + journal = {Chemical Science} +templates: +- The molecule with the {SMILES__description} {SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. +- Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. +- The {SMILES__description} {SMILES#} represents a molecule that is {penetrate_BBB#not &NULL}identified as {penetrate_BBB__names__adjective}. +- The molecule represented with the {SMILES__description} {SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. +- '{SMILES#} represents a molecule that is {penetrate_BBB#not &NULL}identified as + {penetrate_BBB__names__adjective}.' +- '{SMILES#} represents a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.' +- '{SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.' +- The {#molecule |!}{SMILES__description} {SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. +- |- + Task: Please classify a molecule based on the description. + Description: A molecule that is {penetrate_BBB__names__adjective}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words.
+ Result: {penetrate_BBB#False&True} +- |- + Task: Please classify a molecule based on the description. + Description: A molecule that is {penetrate_BBB__names__adjective}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Answer the question in a {#full|complete!} sentence. + Result: This molecule is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. +- |- + Task: Please {#give me|create|generate!} the {SMILES__description} of a {#molecule|chemical|chemical structure!} based on the {#text |!}description{# below|!}. + Description: A molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. + Result: {SMILES#} +- |- + User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} is {penetrate_BBB__names__adjective}? + Assistant: {penetrate_BBB#No&Yes}, this molecule is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. +- |- + User: Is the molecule with the {SMILES__description} {SMILES#} {penetrate_BBB__names__adjective}? + Assistant: {penetrate_BBB#No&Yes}, it is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. +- |- + User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? + Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#} +- |- + User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? + Assistant: This is a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}: {SMILES#} +- |- + User: I want to {#come up with|create|generate!} the {SMILES__description} of a molecule. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? + User: Yes, please. The molecule should {penetrate_BBB#not &NULL}be {penetrate_BBB__names__adjective}.
+ Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}: {SMILES#} +- |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? + User: Yes, the molecule should {penetrate_BBB#not &NULL}be {penetrate_BBB__names__adjective}. + Assistant: Got it, this {SMILES__description} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}: {SMILES#} +- |- + Is the {SMILES__description} {SMILES#} {penetrate_BBB__names__adjective}: {penetrate_BBB#no&yes} +- |- + Task: Please classify a molecule based on the description. + Description: A molecule that is {penetrate_BBB__names__adjective}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. + Result: {penetrate_BBB#False&True} +- |- + Task: Please answer the multiple choice question. + Question: Is the molecule with the {SMILES__description} of {SMILES#} {penetrate_BBB__names__adjective}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. + Options: + {penetrate_BBB%} + Answer: {%multiple_choice_result} +- |- + Task: Please answer the multiple choice question. + Question: Which molecules are {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%penetrate_BBB%} + Answer: {%multiple_choice_result} +- |- + Task: Please answer the multiple choice question. + Question: Is the molecule with the {SMILES__description} of {SMILES#} {penetrate_BBB__names__adjective}? 
+ Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. + Options: + {penetrate_BBB%} + Answer: {%multiple_choice_result} +- |- + Task: Please answer the multiple choice question. + Question: Which molecules are {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%penetrate_BBB%} + Answer: {%multiple_choice_result} diff --git a/data/tabular/blood_brain_barrier_martins_et_al/transform.py b/data/tabular/blood_brain_barrier_martins_et_al/transform.py index 003c48733..83759e9f0 100644 --- a/data/tabular/blood_brain_barrier_martins_et_al/transform.py +++ b/data/tabular/blood_brain_barrier_martins_et_al/transform.py @@ -43,196 +43,5 @@ def get_and_transform_data(): fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "blood_brain_barrier_martins_et_al", # unique identifier, we will also use this for directory names - "description": """As a membrane separating circulating blood and brain extracellular -fluid, the blood-brain barrier (BBB) is the protection layer that blocks most -foreign drugs. 
Thus the ability of a drug to penetrate the barrier to deliver -to the site of action forms a crucial challenge in development of drugs for the -central nervous system.""", - "targets": [ - { - "id": "penetrate_BBB", # name of the column in a tabular dataset - "description": "The ability of a drug to penetrate the blood brain barrier (1) or not (0)", - "units": None, # units of the values in this column (leave empty if unitless) - "type": "boolean", - "names": [ # names for the property (to sample from for building the prompts) - {"noun": "blood brain barrier penetration"}, - {"noun": "ADME blood-brain barrier penetration"}, - {"verb": "penetrates the blood brain barrier to reach the brain"}, - {"verb": "penetrates the blood brain barrier"}, - {"adjective": "penetrating the blood brain barrier"}, - { - "adjective": "penetrating the blood brain barrier to reach the brain" - }, - ], - "uris": None, - }, - ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", - "description": "SMILES", # description (optional, except for "Other") - }, - { - "id": "compound_name", # column name - "type": "Other", - "names": [ - {"noun": "compound name"}, - {"noun": "drug name"}, - {"noun": "generic drug name"}, - ], - "description": "compound name", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.1021/ci300124c", - "description": "corresponding publication", - }, - { - "url": "https://rb.gy/0xx91v", - "description": "corresponding publication", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/adme/#bbb-blood-brain-barrier-martins-et-al", - "description": "data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Martins2012, -doi = {10.1021/ci300124c}, -url = {https://doi.org/10.1021/ci300124c}, -year = {2012}, -month = jun, -publisher = {American Chemical Society (ACS)}, -volume = {52}, -number = {6}, -pages = {1686--1697}, -author = {Ines Filipa Martins and Ana L. Teixeira and Luis Pinheiro -and Andre O. Falcao}, -title = {A Bayesian Approach to in Silico Blood-Brain Barrier Penetration Modeling}, -journal = {Journal of Chemical Information and Modeling}""", - """@article{Wu2018, -doi = {10.1039/c7sc02664a}, -url = {https://doi.org/10.1039/c7sc02664a}, -year = {2018}, -publisher = {Royal Society of Chemistry (RSC)}, -volume = {9}, -number = {2}, -pages = {513--530}, -author = {Zhenqin Wu and Bharath Ramsundar and Evan~N. Feinberg and Joseph -Gomes and Caleb Geniesse and Aneesh S. 
Pappu and Karl Leswing and Vijay Pande}, -title = {MoleculeNet: a benchmark for molecular machine learning}, -journal = {Chemical Science}""", - ], - "templates": [ - "The molecule with the {SMILES__description} {SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.", # noqa: E501 - "Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.", # noqa: E501 - "The {SMILES__description} {SMILES#} represents a molecule that is {penetrate_BBB#not &NULL}identified as {penetrate_BBB__names__adjective}.", # noqa: E501 - "The molecule represented with the {SMILES__description} {SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.", # noqa: E501 - "{SMILES#} represents a molecule that is {penetrate_BBB#not &NULL}identified as {penetrate_BBB__names__adjective}.", # noqa: E501 - "{SMILES#} represents a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.", # noqa: E501 - "{SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.", - "The {#molecule |!}{SMILES__description} {SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.", # noqa: E501 - # Instruction tuning text templates - """Task: Please classify a molecule based on the description. -Description: A molecule that is {penetrate_BBB__names__adjective}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. -Result: {penetrate_BBB#False&True}""", # noqa: E501 - """Task: Please classify a molecule based on the description. -Description: A molecule that is {penetrate_BBB__names__adjective}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Answer the question in a {#full|complete!} sentence. 
-Result: This molecule is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.""", # noqa: E501 - """Task: Please {#give me|create|generate!} the {SMILES__description} of {#molecule|chemical|chemical structure!} based on the {#text |!}description{# below|!}. -Description: A molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. -Result: {SMILES#}""", # noqa: E501 - # Conversational text templates - """User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} is {penetrate_BBB__names__adjective}? -Assistant: {penetrate_BBB#No&Yes}, this molecule is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.""", # noqa: E501 - """User: Is the molecule with the {SMILES__description} {SMILES#} {penetrate_BBB__names__adjective}? -Assistant: {penetrate_BBB#No&Yes}, it is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}.""", # noqa: E501 - """User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? -Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#}""", # noqa: E501 - """User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? -Assistant: This is a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} the {SMILES__description} of a molecule. -Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? -User: Yes, please. The molecule should {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. 
-Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. -Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? -User: Yes, the molecule should {penetrate_BBB#not &NULL}be {penetrate_BBB__names__adjective}. -Assistant: Got it, this {SMILES__description} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}: {SMILES#}""", # noqa: E501 - # Benchmarking text templates - "Is the {SMILES__description} {SMILES#} {penetrate_BBB__names__adjective}:{penetrate_BBB#no&yes}", # noqa: E501 for the benchmarking setup separates input and output - # todo: check if we go for multiple choice only and remove the benchmarking template above and below - """Task: Please classify a molecule based on the description. -Description: A molecule that is {penetrate_BBB__names__adjective}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. -Result:{penetrate_BBB#False&True}""", # noqa: E501 - # noqa: E501 """Task: Please {#give me|create|generate!} a molecule {SMILES__description} based on the {#text |!}description{# below|!}. - # Description: A molecule that is {penetrate_BBB__names__adjective}. - # Result:{SMILES#}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Is the molecule with the {SMILES__description} of {SMILES#} {penetrate_BBB__names__adjective}? -Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. -Options: -{penetrate_BBB%} -Answer: {%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. 
-Question: Which molecules are {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. -Options: -{SMILES%penetrate_BBB%} -Answer: {%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Is the molecule with the {SMILES__description} of {SMILES#} {penetrate_BBB__names__adjective}? -Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. -Options: -{penetrate_BBB%} -Answer:{%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules are {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. -Options: -{SMILES%penetrate_BBB%} -Answer:{%multiple_choice_result}""", # noqa: E501 - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/caco2_wang/meta.yaml b/data/tabular/caco2_wang/meta.yaml index 5d4fb41da..d7dca095f 100644 --- a/data/tabular/caco2_wang/meta.yaml +++ 
b/data/tabular/caco2_wang/meta.yaml @@ -54,3 +54,82 @@ bibtex: year={2016}, publisher={ACS Publications} } +templates: + - The molecule with the {SMILES__description} {#representation of |!}{SMILES#} has a {permeability__names__noun} of {permeability#} {permeability__units}. + - Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule has a {permeability__names__noun} of {permeability#} {permeability__units}. + - The {SMILES__description} {SMILES#} {#represents|is from!} a molecule with a {permeability__names__noun} of {permeability#} {permeability__units}. + - The {#molecule |!}{SMILES__description} {SMILES#} has a {permeability__names__noun} of {permeability#} {permeability__units}. + - |- + Task: Please predict a property for a molecule based on the description. + Description: Predict the {permeability__names__noun}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: You must provide a numerical estimate in units of {permeability__units}. + Result: {permeability#} + - |- + Task: Please predict a property for a molecule based on the description. + Description: Predict the {permeability__names__noun}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Answer the question in a {#full|complete!} sentence. + Result: This molecule has a {permeability__names__noun} of {permeability#} {permeability__units}. + - |- + Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. + Description: A molecule with a {permeability__names__noun} of approximately {permeability#} {permeability__units}. + Result: {SMILES#} + - |- + User: Can you {#tell me|derive|estimate!} the {permeability__names__noun} for the molecule with the {SMILES__description} {SMILES#}? + Assistant: Based on my analysis, the {permeability__names__noun} for this molecule is approximately {permeability#} {permeability__units}. 
+ - |- + User: What is the {permeability__names__noun} value for the molecule with the {SMILES__description} {SMILES#}? + Assistant: The {permeability__names__noun} value for this molecule is approximately {permeability#} {permeability__units}. + - |- + User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that has a {permeability__names__noun} value of approximately {permeability#} {permeability__units}? + Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#} + - |- + User: I'm {#searching|looking!} for the {SMILES__description} of a molecule with a {permeability__names__noun} value close to {permeability#} {permeability__units}. + Assistant: This molecule has a {permeability__names__noun} value of approximately {permeability#} {permeability__units}: {SMILES#} + - |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? + User: Yes, please. The molecule should have a {permeability__names__noun} value of approximately {permeability#} {permeability__units}. + Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} has a {permeability__names__noun} value of approximately {permeability#} {permeability__units}: {SMILES#} + - |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? + User: Yes, the molecule should have a {permeability__names__noun} value close to {permeability#} {permeability__units}. 
+ Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} has a {permeability__names__noun} value of approximately {permeability#} {permeability__units}: {SMILES#} + - What is the {permeability__names__noun} value for the {SMILES__description} {SMILES#}:{permeability#} + - |- + Task: Please predict a property for a molecule based on the description. + Description: Predict the {permeability__names__noun}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: You must provide a numerical estimate in units of {permeability__units}. + Result: {permeability#} + - |- + Task: Please answer the multiple choice question. + Question: Which molecule has the highest {permeability__names__noun}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%permeability%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Which molecule has the highest {permeability__names__noun}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%permeability%} + Answer: {%multiple_choice_result} + + - |- + Task: Please answer the multiple choice question. + Question: Rank these molecules from lowest to highest {permeability__names__noun}. + Constraint: You must select all options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%permeability%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Rank these molecules from lowest to highest {permeability__names__noun}. + Constraint: You must select all options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
+ Options: + {SMILES%permeability%} + Answer: {%multiple_choice_result} diff --git a/data/tabular/caco2_wang/transform.py b/data/tabular/caco2_wang/transform.py index 6a333f852..c21943785 100644 --- a/data/tabular/caco2_wang/transform.py +++ b/data/tabular/caco2_wang/transform.py @@ -44,102 +44,6 @@ def get_and_transform_data(): fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "caco2_wang", # unique identifier, we will also use this for directory names - "description": """The human colon epithelial cancer cell line, Caco-2, -is used as an in vitro model to simulate the human intestinal tissue. -The experimental result on the rate of drug passing through -the Caco-2 cells can approximate the rate at which the drug permeates -through the human intestinal tissue.""", - "targets": [ - { - "id": "permeability", # name of the column in a tabular dataset - "description": "Caco-2 cell effective permeability.", # description of what this column means - "units": "cm/s", - "type": "continuous", - "names": [ # names for the property (to sample from for building the prompts) - {"noun": "Caco-2 cell effective permeability"}, - {"noun": "Caco-2 cell permeability"}, - {"noun": "Caco-2 permeability"}, - ], - "pubchem_aids": [678378], - "uris": [ - "http://www.bioassayontology.org/bao#BAO_0010008", - "http://purl.obolibrary.org/obo/MI_2162", - ], - }, - ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", - "description": "SMILES", # description (optional, except for "Other") - }, - { - "id": "compound_name", - "type": "Other", - "description": "compound name", - "names": [ - {"noun": "compound"}, - {"noun": "compound name"}, - ], - }, - ], - "license": "CC BY 4.0", # license under which the 
original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) - { - "url": "https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al", - "description": "original data set link", - }, - { - "url": "https://pubs.acs.org/doi/10.1021/acs.jcim.5b00642", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{wang2016adme, -title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability -using a combination of NSGA-II and boosting}, -author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao, -Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng}, -journal={Journal of Chemical Information and Modeling}, -volume={56}, -number={4}, -pages={763--773}, -year={2016}, -publisher={ACS Publications} -}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: - https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/carcinogens/meta.yaml b/data/tabular/carcinogens/meta.yaml index abcf07603..66503229b 100644 --- a/data/tabular/carcinogens/meta.yaml +++ b/data/tabular/carcinogens/meta.yaml @@ -105,13 +105,13 @@ templates: Assistant: {#This sounds very exciting. 
|This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {carcinogen#not &NULL}be {carcinogen__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {carcinogen#not &NULL}{carcinogen__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {carcinogen__names__adjective}:{carcinogen#no&yes} + - Is the {SMILES__description} {SMILES#} {carcinogen__names__adjective}:{carcinogen#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {carcinogen__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{carcinogen#False&True} + Result: {carcinogen#False&True} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {carcinogen__names__adjective}? @@ -125,7 +125,7 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {carcinogen%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} - |- Task: Please answer the multiple choice question. Question: Which molecules are {carcinogen#not &NULL}{carcinogen__names__adjective}? @@ -139,4 +139,4 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%carcinogen%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} diff --git a/data/tabular/carcinogens/transform.py b/data/tabular/carcinogens/transform.py index 62aef4568..1488507ab 100644 --- a/data/tabular/carcinogens/transform.py +++ b/data/tabular/carcinogens/transform.py @@ -53,180 +53,6 @@ def get_and_transform_data(): fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "carcinogens", # unique identifier, we will also use this for directory names - "description": """A carcinogen is any substance, radionuclide, or radiation that promotes -carcinogenesis, the formation of cancer. This may be due to the ability to damage -the genome or to the disruption of cellular metabolic processes.""", - "targets": [ - { - "id": "carcinogen", # name of the column in a tabular dataset - "description": "whether it is carcinogenic (1) or not (0).", - "units": None, - "type": "boolean", - "names": [ # names for the property (to sample from for building the prompts) - {"noun": "carcinogen"}, - # {"noun": "substance that promotes carcinogenesis"}, - {"adjective": "carcinogenic"}, - {"gerund": "having the potential to cause cancer"}, - ], - "uris": [ - "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C347", - "http://purl.bioontology.org/ontology/SNOMEDCT/88376000", - ], - }, - ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.1002/qsar.200860192", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1021/ci300367a", - "description": "corresponding publication", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/tox/#carcinogens", - "description": "Data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Lagunin2009, -doi = {10.1002/qsar.200860192}, -url = {https://doi.org/10.1002/qsar.200860192}, -year = {2009}, -month = jun, -publisher = {Wiley}, -volume = {28}, -number = {8}, -pages = {806--810}, -author = {Alexey Lagunin and Dmitrii Filimonov and Alexey Zakharov and Wei Xie -and Ying Huang and Fucheng Zhu and Tianxiang Shen and Jianhua Yao and Vladimir Poroikov}, -title = {Computer-Aided Prediction of Rodent Carcinogenicity by PASS and CISOC PSCT}, -journal = {QSAR & Combinatorial Science}""", - """@article{Cheng2012, -doi = {10.1021/ci300367a}, -url = {https://doi.org/10.1021/ci300367a}, -year = {2012}, -month = nov, -publisher = {American Chemical Society (ACS)}, -volume = {52}, -number = {11}, -pages = {3099--3105}, -author = {Feixiong Cheng and Weihua Li and Yadi Zhou and Jie Shen and Zengrui Wu -and Guixia Liu and Philip W. 
Lee and Yun Tang}, -title = {admetSAR: A Comprehensive Source and Free Tool for Assessment of Chemical ADMET Properties}, -journal = {Journal of Chemical Information and Modeling}""", - ], - "templates": [ - "The molecule with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} {carcinogen#no &NULL}{carcinogen__names__adjective} {#properties|effects!}.", # noqa: E501 - "Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule has {carcinogen#no &NULL}{carcinogen__names__adjective} {#effects|properties|characteristics|features!}.", # noqa: E501 - "The {SMILES__description} {SMILES#} represents a molecule that is {carcinogen#not &NULL}identified as {carcinogen__names__adjective}.", # noqa: E501 - "The {SMILES__description} {SMILES#} is {carcinogen#not &NULL}{carcinogen__names__adjective}.", - "The {#molecule |!}{SMILES__description} {SMILES#} is {carcinogen#not &NULL}{carcinogen__names__adjective}.", # noqa: E501 not all variables need to be used - # Instruction tuning text templates - """Task: Please classify a molecule based on the description. -Description: A molecule that is {carcinogen__names__adjective}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. -Result: {carcinogen#False&True}""", # noqa: E501 - """Task: Please classify a molecule based on the description. -Description: A molecule that is {carcinogen__names__adjective}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Answer the question in a {#full|complete!} sentence. -Result: This molecule is {carcinogen#not &NULL}{carcinogen__names__adjective}.""", - """Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. -Description: A molecule that is {carcinogen#not &NULL}{carcinogen__names__adjective}. 
-Result: {SMILES#}""", # noqa: E501 - # Conversational text templates - """User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} is {carcinogen__names__adjective}? -Assistant: {carcinogen#No&Yes}, this molecule is {carcinogen#not &NULL}{carcinogen__names__adjective}.""", # noqa: E501 - """User: Is the molecule with the {SMILES__description} {SMILES#} {carcinogen__names__adjective}? -Assistant: {carcinogen#No&Yes}, it is {carcinogen#not &NULL}{carcinogen__names__adjective}.""", # noqa: E501 - """User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that is {carcinogen#not &NULL}{carcinogen__names__adjective}? -Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#}""", # noqa: E501 - """User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that is {carcinogen#not &NULL}{carcinogen__names__adjective}? -Assistant: This is a molecule that is {carcinogen#not &NULL}{carcinogen__names__adjective}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. -Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? -User: Yes, please. The molecule should {carcinogen#not &NULL}be {carcinogen__names__adjective}. -Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} is {carcinogen#not &NULL}{carcinogen__names__adjective}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. -Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? -User: Yes, the molecule should {carcinogen#not &NULL}be {carcinogen__names__adjective}. 
-Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {carcinogen#not &NULL}{carcinogen__names__adjective}: {SMILES#}""", # noqa: E501 - # Benchmarking text templates - "Is the {SMILES__description} {SMILES#} {carcinogen__names__adjective}:{carcinogen#no&yes}", # noqa: E501 for the benchmarking setup separates input and output - """Task: Please classify a molecule based on the description. -Description: A molecule that is {carcinogen__names__adjective}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. -Result:{carcinogen#False&True}""", # noqa: E501 - # noqa: E501 """Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. - # Description: A molecule that is {carcinogen__names__adjective}. - # Result:{SMILES#}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {carcinogen__names__adjective}? -Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. -Options: -{carcinogen%} -Answer: {%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {carcinogen__names__adjective}? -Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. -Options: -{carcinogen%} -Answer:{%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules are {carcinogen#not &NULL}{carcinogen__names__adjective}? 
-Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. -Options: -{SMILES%carcinogen%} -Answer: {%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules are {carcinogen#not &NULL}{carcinogen__names__adjective}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. -Options: -{SMILES%carcinogen%} -Answer:{%multiple_choice_result}""", # noqa: E501 - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/cav3_t-type_calcium_channels_butkiewicz/meta.yaml b/data/tabular/cav3_t-type_calcium_channels_butkiewicz/meta.yaml index 03c4c5ea1..8061265cd 100644 --- a/data/tabular/cav3_t-type_calcium_channels_butkiewicz/meta.yaml +++ b/data/tabular/cav3_t-type_calcium_channels_butkiewicz/meta.yaml @@ -130,26 +130,27 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {activity_cav3_t_type_calcium_channels#not &NULL}be {activity_cav3_t_type_calcium_channels__names__gerund}. 
Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {activity_cav3_t_type_calcium_channels__names__gerund}:{activity_cav3_t_type_calcium_channels#no&yes} + - Is the {SMILES__description} {SMILES#} {activity_cav3_t_type_calcium_channels__names__gerund}:{activity_cav3_t_type_calcium_channels#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_cav3_t_type_calcium_channels__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{activity_cav3_t_type_calcium_channels#False&True} + Result: {activity_cav3_t_type_calcium_channels#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_cav3_t_type_calcium_channels__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}. + Result: This molecule is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_cav3_t_type_calcium_channels__names__gerund}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {activity_cav3_t_type_calcium_channels%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question.
Question: Which molecules are {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}? @@ -163,4 +164,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%activity_cav3_t_type_calcium_channels%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/cav3_t-type_calcium_channels_butkiewicz/transform.py b/data/tabular/cav3_t-type_calcium_channels_butkiewicz/transform.py index 5b2a89c2a..c8455c3a9 100644 --- a/data/tabular/cav3_t-type_calcium_channels_butkiewicz/transform.py +++ b/data/tabular/cav3_t-type_calcium_channels_butkiewicz/transform.py @@ -35,199 +35,6 @@ def get_and_transform_data(): fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "cav3_t-type_calcium_channels_butkiewicz", - "description": """This dataset was initially curated from HTS data at the PubChem database. -The curation process is documented in Butkiewicz et al. -Primary screening with AID 449739 identified inhibitors of Cav3 T-type calcium channels. -Four follow-up screens were performed to confirm inhibitory effects on smaller sets of compounds -involving AID 493021, AID 493022, AID 493023, and AID 493041. 
-AID 489005 was performed as counter screen validating active compounds of the primary screen.""", - "targets": [ - { - "id": "activity_cav3_t_type_calcium_channels", # name of the column in a tabular dataset - "description": "whether it active against cav3 t-type calcium channels receptor (1) or not (0)", - "units": None, - "type": "boolean", - "names": [ - {"noun": "inhibition of the cav3 t-type calcium channel activity"}, - {"adjective": "cav3 t-type calcium channel inhibition"}, - { - "gerund": "inhibiting the activity of cav3 t-type calcium channels" - }, - {"verb": "blocks t-type calcium channels"}, - {"verb": "inhibits cav3 t-type calcium channels"}, - ], - "pubchem_aids": [1053190, 489005, 493021, 493022, 493023, 493041], - "uris": ["http://purl.obolibrary.org/obo/CHEBI_194338"], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "description": "original dataset", - }, - { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "benchmarks": [ - { - "name": "TDC", - "link": "https://tdcommons.ai/", - "split_column": "split", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Butkiewicz2013, -doi = {10.3390/molecules18010735}, -url = {https://doi.org/10.3390/molecules18010735}, -year = {2013}, -month = jan, -publisher = {{MDPI} {AG}}, -volume = {18}, -number = {1}, -pages = {735--756}, -author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall -and Pedro Teixeira and C. Weaver and Jens Meiler}, -title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, -journal = {Molecules}}""", - """@article{Kim2018, -doi = {10.1093/nar/gky1033}, -url = {https://doi.org/10.1093/nar/gky1033}, -year = {2018}, -month = oct, -publisher = {Oxford University Press ({OUP})}, -volume = {47}, -number = {D1}, -pages = {D1102--D1109}, -author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He -and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky -and Jian Zhang and Evan E Bolton}, -title = {{PubChem} 2019 update: improved access to chemical data}, -journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, -doi = {}, -url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, -year = {2017}, -publisher = {Chem Inform}, -volume = {3}, -number = {1}, -author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
-and Meiler, J.}, -title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, -journal = {Chemical Science}}""", - ], - "templates": [ - "The molecule with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} {activity_cav3_t_type_calcium_channels#no &NULL}{activity_cav3_t_type_calcium_channels__names__noun}.", # noqa: E501 - "Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule {#shows|exhibits|displays!} {activity_cav3_t_type_calcium_channels#no &NULL}{activity_cav3_t_type_calcium_channels__names__noun}.", # noqa: E501 - "The {SMILES__description} {SMILES#} represents a molecule that {#shows|exhibits|displays!} {activity_cav3_t_type_calcium_channels#no &NULL}{activity_cav3_t_type_calcium_channels__names__noun}.", # noqa: E501 - "The {#molecule |!}{SMILES__description} {SMILES#} is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}.", # noqa: E501 not all variables need to be used - # Instruction tuning text templates - """Task: Please classify a molecule based on the description. -Description: A molecule that is {activity_cav3_t_type_calcium_channels__names__gerund}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. -Result: {activity_cav3_t_type_calcium_channels#False&True}""", # noqa: E501 - """Task: Please classify a molecule based on the description. -Description: A molecule that is {activity_cav3_t_type_calcium_channels__names__gerund}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Answer the question in a {#full|complete!} sentence. 
-Result: This molecule is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}.""", # noqa: E501 - """Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. -Description: A molecule that is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}. -Result: {SMILES#}""", # noqa: E501 - # Conversational text templates - """User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} is {activity_cav3_t_type_calcium_channels__names__gerund}? -Assistant: {activity_cav3_t_type_calcium_channels#No&Yes}, this molecule is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}.""", # noqa: E501 - """User: Is the molecule with the {SMILES__description} {SMILES#} {activity_cav3_t_type_calcium_channels__names__gerund}? -Assistant: {activity_cav3_t_type_calcium_channels#No&Yes}, it is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}.""", # noqa: E501 - """User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}? -Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#}""", # noqa: E501 - """User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}? -Assistant: This is a molecule that is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. -Assistant: {#This sounds very exciting. |This sounds very interesting. 
!}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? -User: Yes, please. The molecule should {activity_cav3_t_type_calcium_channels#not &NULL}be {activity_cav3_t_type_calcium_channels__names__gerund}. -Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. -Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? -User: Yes, the molecule should {activity_cav3_t_type_calcium_channels#not &NULL}be {activity_cav3_t_type_calcium_channels__names__gerund}. -Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}: {SMILES#}""", # noqa: E501 - # Benchmarking text templates - "Is the {SMILES__description} {SMILES#} {activity_cav3_t_type_calcium_channels__names__gerund}:{activity_cav3_t_type_calcium_channels#no&yes}", # noqa: E501 for the benchmarking setup separates input and output - """Task: Please classify a molecule based on the description. -Description: A molecule that is {activity_cav3_t_type_calcium_channels__names__gerund}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. -Result:{activity_cav3_t_type_calcium_channels#False&True}""", # noqa: E501 - """Task: Please classify a molecule based on the description. -Description: A molecule that is {activity_cav3_t_type_calcium_channels__names__gerund}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Answer the question in a {#full|complete!} sentence. 
-Result:This molecule is {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}.""", # noqa: E501 - # noqa: E501 """Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. - # Description: A molecule that is {activity_cav3_t_type_calcium_channels__names__gerund}. - # Result:{SMILES#}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_cav3_t_type_calcium_channels__names__gerund}? -Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. -Options: -{activity_cav3_t_type_calcium_channels%} -Answer:{%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules are {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. -Options: -{SMILES%activity_cav3_t_type_calcium_channels%} -Answer: {%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules are {activity_cav3_t_type_calcium_channels#not &NULL}{activity_cav3_t_type_calcium_channels__names__gerund}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
-Options: -{SMILES%activity_cav3_t_type_calcium_channels%} -Answer:{%multiple_choice_result}""", # noqa: E501 - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: - https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/chebi_20/meta.yaml b/data/tabular/chebi_20/meta.yaml index 53a853540..31c8d7718 100644 --- a/data/tabular/chebi_20/meta.yaml +++ b/data/tabular/chebi_20/meta.yaml @@ -100,8 +100,8 @@ templates: Task: Please create a {#text |!}description for a molecule{# based on its representation|!}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question with {#full|complete!} sentences. - Result:{description#} + Result: {description#} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. 
Description: {description#} - Result:{SMILES#} + Result: {SMILES#} diff --git a/data/tabular/chebi_20/transform.py b/data/tabular/chebi_20/transform.py index ec9f0b68a..1296b91ca 100644 --- a/data/tabular/chebi_20/transform.py +++ b/data/tabular/chebi_20/transform.py @@ -1,6 +1,8 @@ import datasets import pandas as pd import yaml +from huggingface_hub import hf_hub_download + SPLITS = ["train", "test", "validation"] ORIGINAL_COLUMNS = ["CID", "SMILES", "description"] @@ -137,7 +139,11 @@ def get_dataset(split: str) -> datasets.Dataset: https://huggingface.co/docs/datasets/upload_dataset """ # 3 splits of train, val, test - return datasets.load_dataset("OpenBioML/chebi_20", split=split, delimiter="\t") + # return datasets.load_dataset("OpenBioML/chebi_20", split=split, delimiter="\t") + df = hf_hub_download( + repo_id="OpenBioML/chebi_20", repo_type="dataset", filename=split + ".csv" + ) + return pd.read_csv(df, delimiter="\t") def remove_whitespace(sample: dict) -> dict: @@ -147,19 +153,20 @@ def remove_whitespace(sample: dict) -> dict: def clean_dataset(hf_data: datasets.Dataset) -> datasets.Dataset: """Clean the dataset""" - assert list(hf_data.features.keys()) == ORIGINAL_COLUMNS + assert list(hf_data.columns) == ORIGINAL_COLUMNS for old, new in zip(ORIGINAL_COLUMNS, NEW_COLUMNS): if old != new: - hf_data.rename_column(old, new) - return hf_data.map(remove_whitespace, num_proc=4) - + # rename pandas columns + hf_data = hf_data.rename(columns={old: new}) + # return hf_data.map(remove_whitespace, num_proc=4) + return hf_data.apply(remove_whitespace, axis=1) def create_meta_yaml(num_points: int): """Create meta configuration file for the dataset""" # create meta yaml META_TEMPLATE["num_points"] = num_points - with open(META_YAML_PATH, "w+") as f: - yaml.dump(META_TEMPLATE, f, sort_keys=False) + # with open(META_YAML_PATH, "w+") as f: + # yaml.dump(META_TEMPLATE, f, sort_keys=False) print(f"Finished processing chebi-20 {META_TEMPLATE['name']} dataset!") @@ -169,8 
+176,8 @@ def create_meta_yaml(num_points: int): for split in SPLITS: hf_data = get_dataset(split) hf_data_clean = clean_dataset(hf_data) - num_samples += hf_data_clean.num_rows - df_tmp = hf_data_clean.to_pandas() + num_samples += hf_data_clean.shape[0] + df_tmp = hf_data_clean.copy() # TODO: Split information is not used here and in the YAML file, # better use the split defined by all other files. # df_tmp["split"] = split if split != "validation" else "valid" diff --git a/data/tabular/chem_caption_smarts/meta.yaml b/data/tabular/chem_caption_smarts/meta.yaml index 7e8982ccf..b6f45e027 100644 --- a/data/tabular/chem_caption_smarts/meta.yaml +++ b/data/tabular/chem_caption_smarts/meta.yaml @@ -25,6 +25,7 @@ license: CC BY 4.0 links: - url: https://github.com/lamalab-org/chem-caption description: Original codebase used to generate this dataset +num_points: 812177 templates: - |- Question: {#How many times|How often!} does the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contain the substructure with the {smarts__names__noun} {#smarts#}? @@ -38,3 +39,7 @@ templates: - |- User: {#I want to|I have to|I must|I would like to!} know how many times the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains a {completion#} substructure. Assistant: The {#molecule|chemical|compound|chemical structure!} contains the substructure with the {smarts__names__noun} {#smarts#} {completion#} times. + - The {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains the substructure with the {smarts__names__noun} {#smarts#} {completion#} times. + - |- + Task: {#Determine|Calculate|Estimate|Predict!} the number of times the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains the substructure with the {smarts__names__noun} {#smarts#}. 
+ Solution: {completion#} \ No newline at end of file diff --git a/data/tabular/chembl_v29/transform.py b/data/tabular/chembl_v29/transform.py index 38d611caf..e732dcb99 100644 --- a/data/tabular/chembl_v29/transform.py +++ b/data/tabular/chembl_v29/transform.py @@ -112,8 +112,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/chemdner/meta.yaml b/data/tabular/chemdner/meta.yaml index dad41aa94..d990c9c37 100644 --- a/data/tabular/chemdner/meta.yaml +++ b/data/tabular/chemdner/meta.yaml @@ -94,3 +94,9 @@ templates: User: Does the following text contain mentions of {#chemicals|chemical compounds|chemical substances!}? {#Can you return matches?|Can you output matches?|Please return matches.!} {#Text: |!}{sentence#} Assistant: {#I found|There is!} {matched_words#}. + - |- + The chemical entities in "{sentence#}" are "{matched_words#}". + - |- + After analyzing the text "{sentence#}", the identified chemical compounds are "{matched_words#}". + - |- + The {#result|output|answer!} of chemical entity extraction on "{sentence#}" is "{matched_words#}". 
diff --git a/data/tabular/chemistry_stackexchange/meta.yaml b/data/tabular/chemistry_stackexchange/meta.yaml index 33e973401..c417aa2f9 100644 --- a/data/tabular/chemistry_stackexchange/meta.yaml +++ b/data/tabular/chemistry_stackexchange/meta.yaml @@ -28,3 +28,9 @@ templates: {#Task: Generate a title for this question.|Task: Create a meaningful title for this question.|Task: Summarize the question in a title.!} {#Question: |Inquiry: |\n!}{#q} {#Assistant: |Title: |Answer: |!}{#title} + - |- + {#Task: Generate a question based on the answer.|Task: Create a question that corresponds to the answer.|Task: Formulate a question that matches the answer.|Task: Develop a question that aligns with the answer.|Task: Construct a question that is answered by the provided response.|Task: Create a question that is relevant to the answer.!} + {#Answer: |Response: |Solution: |!}{#a} + {#Assistant: |Question: |Inquiry: |!}{#q} + - The answer to the {#question|help request|query!} "{#q}" is "{#a}". + - The title of the {#question|help request|query!} "{#q}" is "{#title}". diff --git a/data/tabular/choline_transporter_butkiewicz/meta.yaml b/data/tabular/choline_transporter_butkiewicz/meta.yaml index 25e6d7ce0..1ec54b224 100644 --- a/data/tabular/choline_transporter_butkiewicz/meta.yaml +++ b/data/tabular/choline_transporter_butkiewicz/meta.yaml @@ -1,117 +1,174 @@ -name: bicerano_dataset +name: choline_transporter_butkiewicz description: |- - This paper outlines a MD simulation workflow based on GPU MD simulation and the - refined optimized potentials for liquid simulation (OPLS) OPLS3e force field to - calculate glass transition temperatures (Tgs) of 315 polymers for which Bicerano - reported experimental values. + This dataset was originally curated from HTS data at + the PubChem database. The primary screen AID 488975 identified + inhibitors of CHT. The counter screen AID 493221 was used as a + validation screen to confirm the active compounds that inhibit CHT. 
+ AID504840 and AID588401 experiments were used as additional validation + experiments. The screen AID 493222 evaluated remaining active compounds + for non-specific activity in parental HEK293 cells. AID602208 tested a + selected set of compounds for 3H choline uptake. The final set of 254 + active compounds was determined by the overlap of active compounds in + screens AID 493221, AID504840, and AID588401 subtracting any + non-specific hits from AID 493222 and all inactive compounds in the + re-confirmation screen AID602208. targets: - - id: Tg_exp - description: experimental glass transition temperature - units: K - type: float - names: - - noun: experimental glass transition temperature - uris: - - id: Tg_calc - description: calculated glass transition temperature - units: K - type: float - names: - - noun: MD-computed glass transition temperature (OPLS3e force field) - - noun: computed glass transition temperature (using MD with OPLS3e force field) - - id: rho_300K_calc - description: computed density at 300K - units: g/cm^3 - type: float - names: - - noun: computed polymer density at 300K (using MD with OPLS3e force field) - - noun: computed density at 300K (using MD with OPLS3e force field) +- id: activity_choline_transporter + description: inhibition of choline transporter receptor (1) or not (0).
+ units: null + type: boolean + names: + - noun: inhibition of choline transporter activity + - adjective: choline transporter activity inhibition + - gerund: inhibiting the choline transporter activity + - verb: inhibits choline transporter activity + pubchem_aids: + - 488975 + - 493221 + - 504840 + - 588401 + - 493222 + - 602208 +benchmarks: +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: PSMILES - type: PSMILES - description: PSMILES - - id: compound_name - type: Other - names: - - noun: compound name - description: polymer name +- id: SMILES + type: SMILES + description: SMILES license: CC BY 4.0 links: - - url: https://pubs.acs.org/doi/10.1021/acsapm.0c00524# - description: corresponding publication - - url: - - https://raw.githubusercontent.com/AdrianM0/chemnlp/main/data/tabular/bicerano_dataset/HT_MD_polymer_properties.csv - description: data source -num_points: 315 +- url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al + description: original dataset +- url: https://doi.org/10.3390/molecules18010735 + description: corresponding publication +- url: https://doi.org/10.1093/nar/gky1033 + description: corresponding publication +- url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ + description: corresponding publication +num_points: 302306 bibtex: - - |- - @article{afzal2021, - author = {Afzal, Mohammad Atif Faiz and Browning, Andrea R. and Goldberg, Alexander and Halls, Mathew D. and Gavartin, Jacob L. and Morisato, - Tsuguo and Hughes, Thomas F. and Giesen, David J. 
and Goose, Joseph E.}, - title = {High-Throughput Molecular Dynamics Simulations and Validation of Thermophysical Properties of Polymers for Various Applications}, - journal = {ACS Applied Polymer Materials}, - volume = {3}, - number = {2}, - pages = {620-630}, - year = {2021}, - doi = {10.1021/acsapm.0c00524}} +- |- + @article{Butkiewicz2013, + doi = {10.3390/molecules18010735}, + url = {https://doi.org/10.3390/molecules18010735}, + year = {2013}, + month = jan, + publisher = {{MDPI} {AG}}, + volume = {18}, + number = {1}, + pages = {735--756}, + author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and + Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens + Meiler}, + title = {Benchmarking Ligand-Based Virtual High-Throughput + Screening with the {PubChem} Database}, + journal = {Molecules}} +- |- + @article{Kim2018, + doi = {10.1093/nar/gky1033}, + url = {https://doi.org/10.1093/nar/gky1033}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D1102--D1109}, + author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and + Asta Gindulyte and Jia He and Siqian He and Qingliang Li and + Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid + Zaslavsky and Jian Zhang and Evan E Bolton}, + title = {{PubChem} 2019 update: improved access to chemical data}, + journal = {Nucleic Acids Research}} +- |- + @article{Butkiewicz2017, + doi = {}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, + year = {2017}, + publisher = {Chem Inform}, + volume = {3}, + number = {1}, + author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, + E. W. and Weaver, D. C. and Meiler, J.}, + title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from + the {P}ub{C}hem {D}atabase}, + journal = {Chemical Science}} templates: - - The polymer with the {PSMILES__description} of {PSMILES#} has an {Tg_exp__names__noun} of {Tg_exp#} {Tg_exp__units}.
- - The polymer with the {PSMILES__description} of {PSMILES#} has a {Tg_calc__names__noun} of {Tg_calc#} {Tg_exp__units}. - - The polymer with the {PSMILES__description} of {PSMILES#} has a {rho_300K_calc__names__noun} of {rho_300K_calc#} {rho_300K_calc__units}. - - The polymer with the {compound_name__names__noun} of {compound_name#} has an {Tg_exp__names__noun} of {Tg_exp#} {Tg_exp__units}. - - The polymer with the {compound_name__names__noun} of {compound_name#} has a {Tg_calc__names__noun} of {Tg_calc#} {Tg_calc__units}. - - The polymer with the {compound_name__names__noun} of {compound_name#} has a {rho_300K_calc__names__noun} of {rho_300K_calc#} {rho_300K_calc__units}. - - What is the {Tg_exp__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? Answer:{Tg_exp#} {Tg_exp__units}. - - What is the {Tg_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? Answer:{Tg_calc#} {Tg_calc__units}. - - What is the {rho_300K_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? Answer:{rho_300K_calc#} {rho_300K_calc__units}. - - What is the {Tg_exp__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? Answer:{Tg_exp#} {Tg_exp__units}. - - What is the {Tg_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? Answer:{Tg_calc#} {Tg_calc__units}. - - What is the {rho_300K_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? Answer:{rho_300K_calc#} {rho_300K_calc__units}. - - The polymer with the {PSMILES__description} {PSMILES#} has an {Tg_exp__names__noun} of {Tg_exp#} {Tg_exp__units} and a {Tg_calc__names__noun} of {Tg_calc#} {Tg_calc__units}. - - The polymer with the {compound_name__names__noun} {compound_name#} has an {Tg_exp__names__noun} of {Tg_exp#} {Tg_exp__units} and a {Tg_calc__names__noun} of {Tg_calc#} {Tg_calc__units}. 
- - Compare the {Tg_exp__names__noun} and {Tg_calc__names__noun} for the polymer with the {PSMILES__description} {PSMILES#}. Answer:{Tg_exp#} {Tg_exp__units}, {Tg_calc#} {Tg_calc__units}. - - Compare the {Tg_exp__names__noun} and {Tg_calc__names__noun} for the polymer with the {compound_name__names__noun} {compound_name#}. Answer:{Tg_exp#} {Tg_exp__units}, {Tg_calc#} {Tg_calc__units}. - - What is the {rho_300K_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#} at 300K? Answer:{rho_300K_calc#} {rho_300K_calc__units}. - - What is the {rho_300K_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#} at 300K? Answer:{rho_300K_calc#} {rho_300K_calc__units}. - - What is the {Tg_exp__names__noun} of the polymer with the {PSMILES__description} {PSMILES#} in Kelvin? Answer:{Tg_exp#}. - - What is the {Tg_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#} in Kelvin? Answer:{Tg_calc#}. - - What is the {rho_300K_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#} in g/cm^3? Answer:{rho_300K_calc#}. - - What is the {Tg_exp__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#} in Kelvin? Answer:{Tg_exp#}. - - What is the {Tg_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#} in Kelvin? Answer:{Tg_calc#}. - - What is the {rho_300K_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#} in g/cm^3? Answer:{rho_300K_calc#}. - - The polymer with the {PSMILES__description} {PSMILES#} has an {Tg_exp__names__noun} of {Tg_exp#} {Tg_exp__units} and a {rho_300K_calc__names__noun} of {rho_300K_calc#} {rho_300K_calc__units}. - - The polymer with the {compound_name__names__noun} {compound_name#} has an {Tg_exp__names__noun} of {Tg_exp#} {Tg_exp__units} and a {rho_300K_calc__names__noun} of {rho_300K_calc#} {rho_300K_calc__units}. 
- - Compare the {Tg_exp__names__noun} and {rho_300K_calc__names__noun} for the polymer with the {PSMILES__description} {PSMILES#}. Answer:{Tg_exp#} {Tg_exp__units}, {rho_300K_calc#} {rho_300K_calc__units}. - - Compare the {Tg_exp__names__noun} and {rho_300K_calc__names__noun} for the polymer with the {compound_name__names__noun} {compound_name#}. Answer:{Tg_exp#} {Tg_exp__units}, {rho_300K_calc#} {rho_300K_calc__units}. + - The molecule with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} {activity_choline_transporter#no &NULL}{activity_choline_transporter__names__noun}. + - Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule {#shows|exhibits|displays!} {activity_choline_transporter#no &NULL}{activity_choline_transporter__names__noun}. + - The {SMILES__description} {SMILES#} represents a molecule that {#shows|exhibits|displays!} {activity_choline_transporter#no &NULL}{activity_choline_transporter__names__noun}. + - The {#molecule |!}{SMILES__description} {SMILES#} is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}. - |- - Question: What is the {Tg_exp__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? - Constraint: You must pick either {%multiple_choice_enum%3%aA1} without using any other words. - Options: - {Tg_exp%} - Answer:{%multiple_choice_result} - - Question: What is the {Tg_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? - Constraint: You must pick either {%multiple_choice_enum%3%aA1} without using any other words. - Options: - {Tg_calc%} - Answer:{%multiple_choice_result} - - Question: What is the {rho_300K_calc__names__noun} of the polymer with the {PSMILES__description} {PSMILES#}? - Constraint: You must pick either {%multiple_choice_enum%3%aA1} without using any other words.
- Options: - {rho_300K_calc%} - Answer:{%multiple_choice_result} - - Question: What is the {Tg_exp__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? - Constraint: You must pick either {%multiple_choice_enum%3%aA1} without using any other words. + Task: Please classify a molecule based on the description. + Description: A molecule that is {activity_choline_transporter__names__gerund}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. + Result: {activity_choline_transporter#False&True} + - |- + Task: Please classify a molecule based on the description. + Description: A molecule that is {activity_choline_transporter__names__gerund}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Answer the question in a {#full|complete!} sentence. + Result: This molecule is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}. + - |- + Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. + Description: A molecule that is {activity_choline_transporter__names__gerund}. + Result: {SMILES#} + - |- + User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} is {activity_choline_transporter__names__gerund}? + Assistant: {activity_choline_transporter#No&Yes}, this molecule is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}. + - |- + User: Is the molecule with the {SMILES__description} {SMILES#} {activity_choline_transporter__names__gerund}? + Assistant: {activity_choline_transporter#No&Yes}, it is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}. 
+ - |- + User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}? + Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#} + - |- + User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}? + Assistant: This is a molecule that is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}: {SMILES#} + - |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? + User: Yes, please. The molecule should {activity_choline_transporter#not &NULL}be {activity_choline_transporter__names__gerund}. + Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}: {SMILES#} + - |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? + User: Yes, the molecule should {activity_choline_transporter#not &NULL}be {activity_choline_transporter__names__gerund}. + Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}: {SMILES#} + - |- + Is the {SMILES__description} {SMILES#} {activity_choline_transporter__names__gerund}: {activity_choline_transporter#no&yes} + - |- + Task: Please classify a molecule based on the description. + Description: A molecule that is {activity_choline_transporter__names__gerund}. 
+ {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. + Result: {activity_choline_transporter#False&True} + - |- + Task: Please classify a molecule based on the description. + Description: A molecule that is {activity_choline_transporter__names__gerund}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Answer the question in a {#full|complete!} sentence. + Result: This molecule is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}. + - |- + Task: Please answer the multiple choice question. + Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_choline_transporter__names__gerund}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: - {Tg_exp%} - Answer:{%multiple_choice_result} - - Question: What is the {Tg_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? - Constraint: You must pick either {%multiple_choice_enum%3%aA1} without using any other words. + {activity_choline_transporter%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Which molecules are {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: - {Tg_calc%} - Answer:{%multiple_choice_result} - - Question: What is the {rho_300K_calc__names__noun} of the polymer with the {compound_name__names__noun} {compound_name#}? - Constraint: You must pick either {%multiple_choice_enum%3%aA1} without using any other words. 
+ {SMILES%activity_choline_transporter%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Which molecules are {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: - {rho_300K_calc%} - Answer:{%multiple_choice_result} + {SMILES%activity_choline_transporter%} + Answer: {%multiple_choice_result} diff --git a/data/tabular/choline_transporter_butkiewicz/transform.py b/data/tabular/choline_transporter_butkiewicz/transform.py index b4a74553c..4eee74808 100644 --- a/data/tabular/choline_transporter_butkiewicz/transform.py +++ b/data/tabular/choline_transporter_butkiewicz/transform.py @@ -37,207 +37,8 @@ def get_and_transform_data(): # save to csv fn_data_csv = "data_clean.csv" + # shuffle df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "choline_transporter_butkiewicz", - "description": """This dataset was originally curated from HTS data at -the PubChem database. The primary screen AID 488975 identified -inhibitors of CHT. The counter screen AID 493221 was used as a -validation screen to confirm the active compounds that inhibit CHT. -AID504840 and AID588401 experiments were used as additional validation -experiments. The screen AID 493222 evaluated remaining active compounds -for non-specific activity in parental HEK293 cells. AID602208 tested a -selected set of compounds for 3H choline uptake. 
The final set of 254 -active compounds was determined by the overlap of active compounds in -screens AID 493221, AID504840, and AID588401 subtracting any -non-specific hits from AID 49322 and all inactive compounds in the -re-confirmation screen AID602208.""", - "targets": [ - { - "id": "activity_choline_transporter", # name of the column in a tabular dataset - "description": "inhibition of choline transporter receptor (1) or not (0).", - "units": None, # units of the values in this column (leave empty if unitless) - "type": "boolean", - "names": [ # names for the property (to sample from for building the prompts) - {"noun": "inhibition of choline transporter activity"}, - {"adjective": "choline transporter activity inhibition"}, - {"gerund": "inhibiting the choline transporter activity"}, - {"verb": "inhibits choline transporter activity"}, - ], - "pubchem_aids": [488975, 493221, 504840, 588401, 493222, 602208], - }, - ], - "benchmarks": [ - { - "name": "TDC", - "link": "https://tdcommons.ai/", - "split_column": "split", - } - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "description": "original dataset", - }, - { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Butkiewicz2013, -doi = {10.3390/molecules18010735}, -url = {https://doi.org/10.3390/molecules18010735}, -year = {2013}, -month = jan, -publisher = {{MDPI} {AG}}, -volume = {18}, -number = {1}, -pages = {735--756}, -author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and -Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens -Meiler}, -title = {Benchmarking Ligand-Based Virtual High-Throughput -Screening with the {PubChem} Database}, -journal = {Molecules}}""", - """@article{Kim2018, -doi = {10.1093/nar/gky1033}, -url = {https://doi.org/10.1093/nar/gky1033}, -year = {2018}, -month = oct, -publisher = {Oxford University Press ({OUP})}, -volume = {47}, -number = {D1}, -pages = {D1102--D1109}, -author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and -Asta Gindulyte and Jia He and Siqian He and Qingliang Li and -Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid -Zaslavsky and Jian Zhang and Evan E Bolton}, -title = {{PubChem} 2019 update: improved access to chemical data}, -journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, -doi = {}, -url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, -year = {2017}, -publisher = {Chem Inform}, -volume = {3}, -number = {1}, -author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, -E. W. and Weaver, D. C. 
and Meiler, J.}, -title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from -the {P}ub{C}hem {D}atabase}}, -journal = {Chemical Science}}""", - ], - "templates": [ - "The molecule with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} {activity_choline_transporter#no &NULL}{activity_choline_transporter__names__noun}.", # noqa: E501 - "Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule {#shows|exhibits|displays!} {activity_choline_transporter#no &NULL}{activity_choline_transporter__names__noun}.", # noqa: E501 - "The {SMILES__description} {SMILES#} represents a molecule that {#shows|exhibits|displays!} {activity_choline_transporter#no &NULL}{activity_choline_transporter__names__noun}.", # noqa: E501 - "The {#molecule |!}{SMILES__description} {SMILES#} is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}.", # noqa: E501 not all variables need to be used - # Instruction tuning text templates - """Task: Please classify a molecule based on the description. -Description: A molecule that is {activity_choline_transporter__names__gerund}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. -Result: {activity_choline_transporter#False&True}""", # noqa: E501 - """Task: Please classify a molecule based on the description. -Description: A molecule that is {activity_choline_transporter__names__gerund}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Answer the question in a {#full|complete!} sentence. -Result: This molecule is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}.""", # noqa: E501 - """Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. 
-Description: A molecule that is {activity_choline_transporter__names__gerund}. -Result: {SMILES#}""", # noqa: E501 - # Conversational text templates - """User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} is {activity_choline_transporter__names__gerund}? -Assistant: {activity_choline_transporter#No&Yes}, this molecule is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}.""", # noqa: E501 - """User: Is the molecule with the {SMILES__description} {SMILES#} {activity_choline_transporter__names__gerund}? -Assistant: {activity_choline_transporter#No&Yes}, it is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}.""", # noqa: E501 - """User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}? -Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#}""", # noqa: E501 - """User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}? -Assistant: This is a molecule that is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. -Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? -User: Yes, please. The molecule should {activity_choline_transporter#not &NULL}be {activity_choline_transporter__names__gerund}. 
-Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}: {SMILES#}""", # noqa: E501 - """User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. -Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? -User: Yes, the molecule should {activity_choline_transporter#not &NULL}be {activity_choline_transporter__names__gerund}. -Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}: {SMILES#}""", # noqa: E501 - # Benchmarking text templates - "Is the {SMILES__description} {SMILES#} {activity_choline_transporter__names__gerund}:{activity_choline_transporter#no&yes}", # noqa: E501 for the benchmarking setup separates input and output - """Task: Please classify a molecule based on the description. -Description: A molecule that is {activity_choline_transporter__names__gerund}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. -Result:{activity_choline_transporter#False&True}""", # noqa: E501 - """Task: Please classify a molecule based on the description. -Description: A molecule that is {activity_choline_transporter__names__gerund}. -{#Molecule |!}{SMILES__description}: {SMILES#} -Constraint: Answer the question in a {#full|complete!} sentence. -Result:This molecule is {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}.""", # noqa: E501 - # noqa: E501 """Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. - # Description: A molecule that is {activity_choline_transporter__names__gerund}. 
- # Result:{SMILES#}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_choline_transporter__names__gerund}? -Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. -Options: -{activity_choline_transporter%} -Answer:{%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules are {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. -Options: -{SMILES%activity_choline_transporter%} -Answer: {%multiple_choice_result}""", # noqa: E501 - """Task: Please answer the multiple choice question. -Question: Which molecules are {activity_choline_transporter#not &NULL}{activity_choline_transporter__names__gerund}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
-Options: -{SMILES%activity_choline_transporter%} -Answer:{%multiple_choice_result}""", # noqa: E501 - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: - https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/clearance_astrazeneca/meta.yaml b/data/tabular/clearance_astrazeneca/meta.yaml index 81eb58889..10138c8a9 100644 --- a/data/tabular/clearance_astrazeneca/meta.yaml +++ b/data/tabular/clearance_astrazeneca/meta.yaml @@ -64,3 +64,19 @@ bibtex: title = {Mechanistic insights from comparing intrinsic clearance values between human liver microsomes and hepatocytes to guide drug design}, journal = {European Journal of Medicinal Chemistry} + +templates: + - |- + The {drug_clearance__names__noun} of a drug with the {SMILES__description} {SMILES#} is {drug_clearance#} {drug_clearance__units}. + - |- + User: {#I need|I want!} to know the {drug_clearance__names__noun} of a drug with the {SMILES__description} {SMILES#}. + Assistant: The {drug_clearance__names__noun} is {drug_clearance#} {drug_clearance__units}. + - |- + Question: What is the {drug_clearance__names__noun} of a drug with the {SMILES__description} {SMILES#}? + Constraint: Return only the {drug_clearance__names__noun} without {#any additional information|any extra information|any other details!}!
+ Answer: {drug_clearance#} {drug_clearance__units}. + - |- + Task: {#Predict|Estimate|Calculate|Determine!} the {drug_clearance__names__noun} of a drug with the {SMILES__description} {SMILES#}. + Solution: {drug_clearance#} {drug_clearance__units}. + - |- + The {#molecule|compound!} with the {SMILES__description} {SMILES#} has a {drug_clearance__names__noun} of {drug_clearance#} {drug_clearance__units}. \ No newline at end of file diff --git a/data/tabular/clearance_astrazeneca/transform.py b/data/tabular/clearance_astrazeneca/transform.py index d3a600031..e031084e3 100644 --- a/data/tabular/clearance_astrazeneca/transform.py +++ b/data/tabular/clearance_astrazeneca/transform.py @@ -1,5 +1,4 @@ import pandas as pd -import yaml from tdc.single_pred import ADME @@ -43,108 +42,6 @@ def get_and_transform_data(): # save to csv fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - meta = { - "name": "clearance_astrazeneca", # unique identifier, we will also use this for directory names - "description": """Drug clearance is defined as the volume of plasma cleared of a drug -over a specified time period and it measures the rate at which the active drug -is removed from the body. This is a dataset curated from ChEMBL database containing -experimental results on intrinsic clearance, deposited from AstraZeneca.
It -contains clearance measures from two experiments types, hepatocyte and microsomes.""", - "targets": [ - { - "id": "drug_clearance", # name of the column in a tabular dataset - "description": "the volume of plasma cleared of a drug over a specified time period", - "units": "mL / (min g)", # units of the values in this column (leave empty if unitless) - "type": "continuous", - "names": [ # names for the property (to sample from for building the prompts) - {"noun": "drug clearance"}, - {"noun": "volume of plasma cleared over a specified time period"}, - ], - "uris": [ - "http://purl.bioontology.org/ontology/MEDDRA/10077254", - ], - }, - ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", - "description": "SMILES", # description (optional, except for "Other") - }, - { - "id": "chembl_id", # column name - "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "names": [{"noun": "ChEMBL id"}, {"noun": "ChEMBL identifier number"}], - "description": "ChEMBL ids", - "sample": False, - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "http://dx.doi.org/10.6019/CHEMBL3301361", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1016/j.ejmech.2012.06.043", - "description": "corresponding publication", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/adme/#clearance-astrazeneca", - "description": "data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@techreport{Hersey2015, -doi = {10.6019/chembl3301361}, -url = {https://doi.org/10.6019/chembl3301361}, -year = {2015}, -month = feb, -publisher = {{EMBL}-{EBI}}, -author = {Anne Hersey}, -title = {{ChEMBL} Deposited Data Set - {AZ dataset}}""", - """@article{Di2012, -doi = {10.1016/j.ejmech.2012.06.043}, -url = {https://doi.org/10.1016/j.ejmech.2012.06.043}, -year = {2012}, -month = nov, -publisher = {Elsevier BV}, -volume = {57}, -pages = {441--448}, -author = {Li Di and Christopher Keefer and Dennis O. Scott and Timothy J. Strelevitz -and George Chang and Yi-An Bi and Yurong Lai and Jonathon Duckworth and -Katherine Fenner and Matthew D. Troutman and R. 
Scott Obach}, -title = {Mechanistic insights from comparing intrinsic clearance values between -human liver microsomes and hepatocytes to guide drug design}, -journal = {European Journal of Medicinal Chemistry}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") if __name__ == "__main__": diff --git a/data/tabular/clintox/meta.yaml b/data/tabular/clintox/meta.yaml index 282d3342c..d4302c3aa 100644 --- a/data/tabular/clintox/meta.yaml +++ b/data/tabular/clintox/meta.yaml @@ -90,13 +90,13 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {clinical_toxicity#not &NULL}be {clinical_toxicity__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {clinical_toxicity#not &NULL}{clinical_toxicity__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {clinical_toxicity__names__adjective}:{clinical_toxicity#no&yes} + - Is the {SMILES__description} {SMILES#} {clinical_toxicity__names__adjective}:{clinical_toxicity#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {clinical_toxicity__names__adjective}. 
{#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{clinical_toxicity#False&True} + Result: {clinical_toxicity#False&True} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {clinical_toxicity__names__adjective}? @@ -110,7 +110,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {clinical_toxicity%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {clinical_toxicity#not &NULL}{clinical_toxicity__names__adjective}? @@ -124,4 +125,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%clinical_toxicity%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/clintox/transform.py b/data/tabular/clintox/transform.py index 5376d76a3..31e1bf3d0 100644 --- a/data/tabular/clintox/transform.py +++ b/data/tabular/clintox/transform.py @@ -188,8 +188,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/cyp2c9_substrate_carbonmangels/meta.yaml b/data/tabular/cyp2c9_substrate_carbonmangels/meta.yaml index 4e950d80f..580a752a3 100644 --- a/data/tabular/cyp2c9_substrate_carbonmangels/meta.yaml +++ b/data/tabular/cyp2c9_substrate_carbonmangels/meta.yaml @@ -113,13 +113,13 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {CYP2C9_Substrate#not &NULL}be a {CYP2C9_Substrate__names__noun}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {CYP2C9_Substrate#not &NULL}a {CYP2C9_Substrate__names__noun}: {SMILES#} - - Is the {SMILES__description} {SMILES#} a {CYP2C9_Substrate__names__noun}:{CYP2C9_Substrate#no&yes} + - Is the {SMILES__description} {SMILES#} a {CYP2C9_Substrate__names__noun}:{CYP2C9_Substrate#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is a {CYP2C9_Substrate__names__noun}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{CYP2C9_Substrate#False&True} + Result: {CYP2C9_Substrate#False&True} - |- Task: Please answer the multiple choice question. 
Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {CYP2C9_Substrate__names__verb}? @@ -133,7 +133,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {CYP2C9_Substrate%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {CYP2C9_Substrate#not &NULL}a {CYP2C9_Substrate__names__noun}? @@ -147,4 +148,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%CYP2C9_Substrate%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/cyp2c9_substrate_carbonmangels/transform.py b/data/tabular/cyp2c9_substrate_carbonmangels/transform.py index 7bc96bee1..683150622 100644 --- a/data/tabular/cyp2c9_substrate_carbonmangels/transform.py +++ b/data/tabular/cyp2c9_substrate_carbonmangels/transform.py @@ -226,8 +226,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/cyp2d6_substrate_carbonmangels/meta.yaml b/data/tabular/cyp2d6_substrate_carbonmangels/meta.yaml index f7446332e..1304c9242 100644 --- a/data/tabular/cyp2d6_substrate_carbonmangels/meta.yaml +++ b/data/tabular/cyp2d6_substrate_carbonmangels/meta.yaml @@ -114,13 +114,13 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {CYP2D6_Substrate#not &NULL}be a {CYP2D6_Substrate__names__noun}. 
Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {CYP2D6_Substrate#not &NULL}a {CYP2D6_Substrate__names__noun}: {SMILES#} - - Is the {SMILES__description} {SMILES#} a {CYP2D6_Substrate__names__noun}:{CYP2D6_Substrate#no&yes} + - Is the {SMILES__description} {SMILES#} a {CYP2D6_Substrate__names__noun}:{CYP2D6_Substrate#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is a {CYP2D6_Substrate__names__noun}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{CYP2D6_Substrate#False&True} + Result: {CYP2D6_Substrate#False&True} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {CYP2D6_Substrate__names__verb}? @@ -134,7 +134,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {CYP2D6_Substrate%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {CYP2D6_Substrate#not &NULL}a {CYP2D6_Substrate__names__noun}? @@ -148,4 +149,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%CYP2D6_Substrate%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/cyp2d6_substrate_carbonmangels/transform.py b/data/tabular/cyp2d6_substrate_carbonmangels/transform.py index abfeffddf..ddfc881f8 100644 --- a/data/tabular/cyp2d6_substrate_carbonmangels/transform.py +++ b/data/tabular/cyp2d6_substrate_carbonmangels/transform.py @@ -227,8 +227,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/cyp3a4_substrate_carbonmangels/meta.yaml b/data/tabular/cyp3a4_substrate_carbonmangels/meta.yaml index 020c1e096..5243d10bb 100644 --- a/data/tabular/cyp3a4_substrate_carbonmangels/meta.yaml +++ b/data/tabular/cyp3a4_substrate_carbonmangels/meta.yaml @@ -115,13 +115,13 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {CYP3A4_Substrate#not &NULL}be a {CYP3A4_Substrate__names__noun}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {CYP3A4_Substrate#not &NULL}a {CYP3A4_Substrate__names__noun}: {SMILES#} - - Is the {SMILES__description} {SMILES#} a {CYP3A4_Substrate__names__noun}:{CYP3A4_Substrate#no&yes} + - Is the {SMILES__description} {SMILES#} a {CYP3A4_Substrate__names__noun}:{CYP3A4_Substrate#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is a {CYP3A4_Substrate__names__noun}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. 
- Result:{CYP3A4_Substrate#False&True} + Result: {CYP3A4_Substrate#False&True} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {CYP3A4_Substrate__names__verb}? @@ -135,7 +135,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {CYP3A4_Substrate%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {CYP3A4_Substrate#not &NULL}a {CYP3A4_Substrate__names__noun}? @@ -149,4 +150,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%CYP3A4_Substrate%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/cyp3a4_substrate_carbonmangels/transform.py b/data/tabular/cyp3a4_substrate_carbonmangels/transform.py index 0b1ae2aa3..379f76e04 100644 --- a/data/tabular/cyp3a4_substrate_carbonmangels/transform.py +++ b/data/tabular/cyp3a4_substrate_carbonmangels/transform.py @@ -228,8 +228,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/cyp_p450_1a2_inhibition_veith_et_al/meta.yaml b/data/tabular/cyp_p450_1a2_inhibition_veith_et_al/meta.yaml index ae5056c6c..ed21a5653 100644 --- a/data/tabular/cyp_p450_1a2_inhibition_veith_et_al/meta.yaml +++ b/data/tabular/cyp_p450_1a2_inhibition_veith_et_al/meta.yaml @@ -96,19 +96,19 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. 
!}Should it be a special {#molecule|one!}? User: Yes, the molecule should {CYP1A2_inhibition#not &NULL}be {CYP1A2_inhibition__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {CYP1A2_inhibition#not &NULL}{CYP1A2_inhibition__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {CYP1A2_inhibition__names__gerund}:{CYP1A2_inhibition#no&yes} + - Is the {SMILES__description} {SMILES#} {CYP1A2_inhibition__names__gerund}:{CYP1A2_inhibition#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {CYP1A2_inhibition__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{CYP1A2_inhibition#False&True} + Result: {CYP1A2_inhibition#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {CYP1A2_inhibition__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {CYP1A2_inhibition#not &NULL}{CYP1A2_inhibition__names__gerund}. + Result:This molecule is {CYP1A2_inhibition#not &NULL}{CYP1A2_inhibition__names__gerund}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {CYP1A2_inhibition__names__gerund}? @@ -122,7 +122,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {CYP1A2_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {CYP1A2_inhibition#not &NULL}{CYP1A2_inhibition__names__gerund}? 
@@ -136,4 +137,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%CYP1A2_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/cyp_p450_1a2_inhibition_veith_et_al/transform.py b/data/tabular/cyp_p450_1a2_inhibition_veith_et_al/transform.py index 54887d69e..5b9378e1f 100644 --- a/data/tabular/cyp_p450_1a2_inhibition_veith_et_al/transform.py +++ b/data/tabular/cyp_p450_1a2_inhibition_veith_et_al/transform.py @@ -207,8 +207,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/cyp_p450_2c19_inhibition_veith_et_al/meta.yaml b/data/tabular/cyp_p450_2c19_inhibition_veith_et_al/meta.yaml index 239aa54ed..0f736501f 100644 --- a/data/tabular/cyp_p450_2c19_inhibition_veith_et_al/meta.yaml +++ b/data/tabular/cyp_p450_2c19_inhibition_veith_et_al/meta.yaml @@ -96,19 +96,19 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {CYP2C19_inhibition#not &NULL}be {CYP2C19_inhibition__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {CYP2C19_inhibition#not &NULL}{CYP2C19_inhibition__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {CYP2C19_inhibition__names__gerund}:{CYP2C19_inhibition#no&yes} + - Is the {SMILES__description} {SMILES#} {CYP2C19_inhibition__names__gerund}:{CYP2C19_inhibition#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {CYP2C19_inhibition__names__gerund}. 
{#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{CYP2C19_inhibition#False&True} + Result: {CYP2C19_inhibition#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {CYP2C19_inhibition__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {CYP2C19_inhibition#not &NULL}{CYP2C19_inhibition__names__gerund}. + Result:This molecule is {CYP2C19_inhibition#not &NULL}{CYP2C19_inhibition__names__gerund}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {CYP2C19_inhibition__names__gerund}? @@ -122,7 +122,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {CYP2C19_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {CYP2C19_inhibition#not &NULL}{CYP2C19_inhibition__names__gerund}? @@ -136,4 +137,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%CYP2C19_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/cyp_p450_2c19_inhibition_veith_et_al/transform.py b/data/tabular/cyp_p450_2c19_inhibition_veith_et_al/transform.py index 86d582887..fe74465e1 100644 --- a/data/tabular/cyp_p450_2c19_inhibition_veith_et_al/transform.py +++ b/data/tabular/cyp_p450_2c19_inhibition_veith_et_al/transform.py @@ -207,8 +207,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/cyp_p450_2c9_inhibition_veith_et_al/meta.yaml b/data/tabular/cyp_p450_2c9_inhibition_veith_et_al/meta.yaml index 56889d1e7..79bfd1fc5 100644 --- a/data/tabular/cyp_p450_2c9_inhibition_veith_et_al/meta.yaml +++ b/data/tabular/cyp_p450_2c9_inhibition_veith_et_al/meta.yaml @@ -93,19 +93,19 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {CYP2C9_inhibition#not &NULL}be {CYP2C9_inhibition__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {CYP2C9_inhibition#not &NULL}{CYP2C9_inhibition__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {CYP2C9_inhibition__names__gerund}:{CYP2C9_inhibition#no&yes} + - Is the {SMILES__description} {SMILES#} {CYP2C9_inhibition__names__gerund}:{CYP2C9_inhibition#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {CYP2C9_inhibition__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. 
- Result:{CYP2C9_inhibition#False&True} + Result: {CYP2C9_inhibition#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {CYP2C9_inhibition__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {CYP2C9_inhibition#not &NULL}{CYP2C9_inhibition__names__gerund}. + Result:This molecule is {CYP2C9_inhibition#not &NULL}{CYP2C9_inhibition__names__gerund}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {CYP2C9_inhibition__names__gerund}? @@ -119,7 +119,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {CYP2C9_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {CYP2C9_inhibition#not &NULL}{CYP2C9_inhibition__names__gerund}? @@ -133,4 +134,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%CYP2C9_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/cyp_p450_2c9_inhibition_veith_et_al/transform.py b/data/tabular/cyp_p450_2c9_inhibition_veith_et_al/transform.py index 9ee86b2e9..363da2237 100644 --- a/data/tabular/cyp_p450_2c9_inhibition_veith_et_al/transform.py +++ b/data/tabular/cyp_p450_2c9_inhibition_veith_et_al/transform.py @@ -204,8 +204,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/cyp_p450_2d6_inhibition_veith_et_al/meta.yaml b/data/tabular/cyp_p450_2d6_inhibition_veith_et_al/meta.yaml index c1c5cb9a7..50f500f69 100644 --- a/data/tabular/cyp_p450_2d6_inhibition_veith_et_al/meta.yaml +++ b/data/tabular/cyp_p450_2d6_inhibition_veith_et_al/meta.yaml @@ -94,19 +94,19 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {CYP2D6_inhibition#not &NULL}be {CYP2D6_inhibition__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {CYP2D6_inhibition#not &NULL}{CYP2D6_inhibition__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {CYP2D6_inhibition__names__gerund}:{CYP2D6_inhibition#no&yes} + - Is the {SMILES__description} {SMILES#} {CYP2D6_inhibition__names__gerund}:{CYP2D6_inhibition#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {CYP2D6_inhibition__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. 
- Result:{CYP2D6_inhibition#False&True} + Result: {CYP2D6_inhibition#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {CYP2D6_inhibition__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {CYP2D6_inhibition#not &NULL}{CYP2D6_inhibition__names__gerund}. + Result:This molecule is {CYP2D6_inhibition#not &NULL}{CYP2D6_inhibition__names__gerund}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {CYP2D6_inhibition__names__gerund}? @@ -120,7 +120,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {CYP2D6_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {CYP2D6_inhibition#not &NULL}{CYP2D6_inhibition__names__gerund}? @@ -134,4 +135,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%CYP2D6_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/cyp_p450_2d6_inhibition_veith_et_al/transform.py b/data/tabular/cyp_p450_2d6_inhibition_veith_et_al/transform.py index a904e9c98..7f040d410 100644 --- a/data/tabular/cyp_p450_2d6_inhibition_veith_et_al/transform.py +++ b/data/tabular/cyp_p450_2d6_inhibition_veith_et_al/transform.py @@ -206,8 +206,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/cyp_p450_3a4_inhibition_veith_et_al/meta.yaml b/data/tabular/cyp_p450_3a4_inhibition_veith_et_al/meta.yaml index b17d5272d..fbd8449a6 100644 --- a/data/tabular/cyp_p450_3a4_inhibition_veith_et_al/meta.yaml +++ b/data/tabular/cyp_p450_3a4_inhibition_veith_et_al/meta.yaml @@ -95,19 +95,19 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {CYP3A4_inhibition#not &NULL}be {CYP3A4_inhibition__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {CYP3A4_inhibition#not &NULL}{CYP3A4_inhibition__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {CYP3A4_inhibition__names__gerund}:{CYP3A4_inhibition#no&yes} + - Is the {SMILES__description} {SMILES#} {CYP3A4_inhibition__names__gerund}:{CYP3A4_inhibition#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {CYP3A4_inhibition__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. 
- Result:{CYP3A4_inhibition#False&True} + Result: {CYP3A4_inhibition#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {CYP3A4_inhibition__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {CYP3A4_inhibition#not &NULL}{CYP3A4_inhibition__names__gerund}. + Result:This molecule is {CYP3A4_inhibition#not &NULL}{CYP3A4_inhibition__names__gerund}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {CYP3A4_inhibition__names__gerund}? @@ -121,7 +121,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {CYP3A4_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {CYP3A4_inhibition#not &NULL}{CYP3A4_inhibition__names__gerund}? @@ -135,4 +136,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%CYP3A4_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/cyp_p450_3a4_inhibition_veith_et_al/transform.py b/data/tabular/cyp_p450_3a4_inhibition_veith_et_al/transform.py index 15522c946..ad9bfb2d4 100644 --- a/data/tabular/cyp_p450_3a4_inhibition_veith_et_al/transform.py +++ b/data/tabular/cyp_p450_3a4_inhibition_veith_et_al/transform.py @@ -207,8 +207,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/drug_induced_liver_injury/meta.yaml b/data/tabular/drug_induced_liver_injury/meta.yaml index 9338f7266..1b59b3b31 100644 --- a/data/tabular/drug_induced_liver_injury/meta.yaml +++ b/data/tabular/drug_induced_liver_injury/meta.yaml @@ -91,13 +91,13 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {liver_injury#not &NULL}be causing a {liver_injury__names__noun}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {liver_injury#not &NULL}causing a {liver_injury__names__noun}: {SMILES#} - - Is the {SMILES__description} {SMILES#} causing a {liver_injury__names__noun}:{liver_injury#no&yes} + - Is the {SMILES__description} {SMILES#} causing a {liver_injury__names__noun}:{liver_injury#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that {#shows|causes!} a {liver_injury__names__noun}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. 
- Result:{liver_injury#False&True} + Result: {liver_injury#False&True} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} causing a {liver_injury__names__noun}? @@ -111,7 +111,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {liver_injury%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {liver_injury#not &NULL} causing a {liver_injury__names__noun}? @@ -125,4 +126,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%liver_injury%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/drug_induced_liver_injury/transform.py b/data/tabular/drug_induced_liver_injury/transform.py index ea97bcf89..41e4fc09e 100644 --- a/data/tabular/drug_induced_liver_injury/transform.py +++ b/data/tabular/drug_induced_liver_injury/transform.py @@ -200,8 +200,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/drugchat_liang_zhang_et_al/meta.yaml b/data/tabular/drugchat_liang_zhang_et_al/meta.yaml index 8584e8186..c86be8e62 100644 --- a/data/tabular/drugchat_liang_zhang_et_al/meta.yaml +++ b/data/tabular/drugchat_liang_zhang_et_al/meta.yaml @@ -21,7 +21,7 @@ links: description: corresponding publication - url: https://github.com/UCSD-AI4H/drugchat description: rep & data source -num_points: 143,517 +num_points: 143517 bibtex: - |- 
@article{Liang2023, diff --git a/data/tabular/flashpoint/meta.yaml b/data/tabular/flashpoint/meta.yaml index 184fe3ba7..2486130ea 100644 --- a/data/tabular/flashpoint/meta.yaml +++ b/data/tabular/flashpoint/meta.yaml @@ -35,3 +35,50 @@ bibtex: pages={e1900101}, year={2020} }" + +templates: + - |- + {#Task: |Task: |!}{#Predict|Estimate!} the {flashpoint__names__noun} of {SMILES#}. + {#Answer: |A: |!}The {flashpoint__names__noun} is {flashpoint#} {flashpoint__units}. + - |- + {#Task: |Task: |!}{#Predict|Estimate!} the {flashpoint__names__noun} of a {#molecule|compound!} with the {SMILES__description} {SMILES#}. + {#Answer: |A: |!}{#The flashpoint is |!}{flashpoint#} {flashpoint__units}. + - |- + {#Question: |Q: !}What is the {flashpoint__names__noun} of {SMILES#}? + {#Answer: |A: |!}{#The flashpoint is |!}{flashpoint#} {flashpoint__units}. + - |- + {#Question: |Q: !}What is the {flashpoint__names__noun} of a {#molecule|compound!} with the {SMILES__description} {SMILES#}? + {#Answer: |A: |!}{#The flashpoint is |!}{flashpoint#} {flashpoint__units}. + - |- + {#Question: |Q: !}What is a compound with a {flashpoint__names__noun} of {flashpoint#} {flashpoint__units}? + {#Answer: |A: |!}{SMILES#} + - |- + User: I have a question about {SMILES#}. + Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} + User: What is the {flashpoint__names__noun} of {#this compound|this molecule!}? + Assistant: {#The flashpoint is |!}{flashpoint#} {flashpoint__units}. + - |- + User: I have a question about a {#compound|molecule!} with the {SMILES__description} {SMILES#}. + Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} + User: What is the {flashpoint__names__noun} of {#this compound|this molecule!}? + Assistant: {#The flashpoint is |!}{flashpoint#} {flashpoint__units}. 
{#Is there anything else I can help you with?|Do you have any other questions?|Do you have any other questions for me?|Is there anything else I can help you with today?|Do you have any other questions for me today?!} + User: {#Yes,|Indeed,!} what is the name of {#this compound|this molecule!}? + Assistant: {SMILES#} + - |- + Task: Please estimate the {flashpoint__names__noun} of a compound. + Compound: {SMILES#} + Result: The flashpoint is {flashpoint#} {flashpoint__units} + - |- + Task: Please estimate the {flashpoint__names__noun} of a compound. + {SMILES__description}: {SMILES#} + Result: The flashpoint is {flashpoint#} {flashpoint__units} + - |- + Question: What is the {flashpoint__names__noun} of a compound with the {SMILES__description} {SMILES#} in {flashpoint__units}? + + Answer: The flashpoint is {flashpoint#} + - |- + Question: Which molecule has a {flashpoint__names__noun} of {flashpoint#} {flashpoint__units}? + Pick {%multiple_choice_enum%3%aA1}. + Options: + {SMILES%} + Answer: {%multiple_choice_result} diff --git a/data/tabular/freesolv/meta.yaml b/data/tabular/freesolv/meta.yaml index 04758b6d9..c0527fad0 100644 --- a/data/tabular/freesolv/meta.yaml +++ b/data/tabular/freesolv/meta.yaml @@ -64,13 +64,13 @@ templates: - "User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that has a {GAFF__names__noun} of {GAFF#} {GAFF__units}.\nAssistant: This is a molecule that has a {GAFF__names__noun} of {GAFF#} {GAFF__units}: {SMILES#}" - "User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}.\nAssistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}?\nUser: Yes, please. 
The molecule should have a {GAFF__names__noun} of {GAFF#} {GAFF__units}.\nAssistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} represents a molecule that has a {GAFF__names__noun} of {GAFF#} {GAFF__units}: {SMILES#}" - "User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}.\nAssistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}?\nUser: Yes, the molecule should have a {GAFF__names__noun} of {GAFF#} {GAFF__units}.\nAssistant: {#Understood|Got it|Ok!}, this {SMILES__description} represents a molecule that has a {GAFF__names__noun} of {GAFF#} {GAFF__units}: {SMILES#}" - - The {exp_value__names__noun} of the molecule with the {SMILES__description} {SMILES#} is:{exp_value#} {exp_value__units} - - The {exp_value__names__noun} of the {SMILES__description} {SMILES#} is:{exp_value#} {exp_value__units} - - The {exp_value__names__noun} of the molecule {SMILES__description} {SMILES#} is:{exp_value#} {exp_value__units} - - "Task: Please predict a molecule feature based on the description.\nDescription: Predict the {exp_value__names__noun} in {exp_value__units} of a molecule.\n{#Molecule |!}{SMILES__description}: {SMILES#}\nConstraint: Even if you are {#uncertain|not sure!}, you must answer with a numeric value in {exp_value__units} without using any {#other|additional!} words.\nResult:{exp_value#} {exp_value__units}" - - "Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}.\nDescription: A molecule that has {exp_value__names__noun} of {exp_value#} {exp_value__units}.\nResult:{SMILES#}" - - The {GAFF__names__noun} of the molecule with the {SMILES__description} {SMILES#} is:{GAFF#} {GAFF__units} - - The {GAFF__names__noun} of the {SMILES__description} {SMILES#} is:{GAFF#} {GAFF__units} - - The {GAFF__names__noun} of the molecule {SMILES__description} {SMILES#} is:{GAFF#} {GAFF__units} - - "Task: 
Please predict a molecule feature based on the description.\nDescription: Predict the {GAFF__names__noun} in {GAFF__units} of a molecule.\n{#Molecule |!}{SMILES__description}: {SMILES#}\nConstraint: Even if you are {#uncertain|not sure!}, you must answer with a numeric value in {GAFF__units} without using any {#other|additional!} words.\nResult:{GAFF#} {GAFF__units}" - - "Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}.\nDescription: A molecule that has {GAFF__names__noun} of {GAFF#} {GAFF__units}.\nResult:{SMILES#}" + - The {exp_value__names__noun} of the molecule with the {SMILES__description} {SMILES#} is:{exp_value#} {exp_value__units} + - The {exp_value__names__noun} of the {SMILES__description} {SMILES#} is:{exp_value#} {exp_value__units} + - The {exp_value__names__noun} of the molecule {SMILES__description} {SMILES#} is:{exp_value#} {exp_value__units} + - "Task: Please predict a molecule feature based on the description.\nDescription: Predict the {exp_value__names__noun} in {exp_value__units} of a molecule.\n{#Molecule |!}{SMILES__description}: {SMILES#}\nConstraint: Even if you are {#uncertain|not sure!}, you must answer with a numeric value in {exp_value__units} without using any {#other|additional!} words.\nResult: {exp_value#} {exp_value__units}" + - "Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}.\nDescription: A molecule that has {exp_value__names__noun} of {exp_value#} {exp_value__units}.\nResult: {SMILES#}" + - The {GAFF__names__noun} of the molecule with the {SMILES__description} {SMILES#} is:{GAFF#} {GAFF__units} + - The {GAFF__names__noun} of the {SMILES__description} {SMILES#} is:{GAFF#} {GAFF__units} + - The {GAFF__names__noun} of the molecule {SMILES__description} {SMILES#} is:{GAFF#} {GAFF__units} + - "Task: Please predict a molecule feature based on the description.\nDescription: Predict 
the {GAFF__names__noun} in {GAFF__units} of a molecule.\n{#Molecule |!}{SMILES__description}: {SMILES#}\nConstraint: Even if you are {#uncertain|not sure!}, you must answer with a numeric value in {GAFF__units} without using any {#other|additional!} words.\nResult: {GAFF#} {GAFF__units}" + - "Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}.\nDescription: A molecule that has {GAFF__names__noun} of {GAFF#} {GAFF__units}.\nResult: {SMILES#}" diff --git a/data/tabular/freesolv/transform.py b/data/tabular/freesolv/transform.py index 8a0860534..b02eac002 100644 --- a/data/tabular/freesolv/transform.py +++ b/data/tabular/freesolv/transform.py @@ -219,8 +219,8 @@ def get_and_transform_data(): ], } fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/h2_storage_materials/meta.yaml b/data/tabular/h2_storage_materials/meta.yaml index b06e215a4..8dca9d3d3 100644 --- a/data/tabular/h2_storage_materials/meta.yaml +++ b/data/tabular/h2_storage_materials/meta.yaml @@ -22,7 +22,7 @@ identifiers: names: - noun: chemical formula description: chemical formula - - id: synthetic_information + - id: synthesis_information names: - noun: synthesis procedure summary description: brief description of synthetic procedure @@ -33,6 +33,37 @@ links: description: website with source data - url: https://datahub.hymarc.org/dataset/ad580d95-e7e2-4ef4-a7f6-3b2f91a96eba/resource/4ef1c494-366e-43a3-bed4-a3985de5c374/download/hydstormatdb-reversible_hydrides.csv description: original_dataset -num_points: 30 +num_points: 5 bibtex: - - "@online{hymarcReversibleHydrides,\ntitle={Hydrogen Storage Materials Database Reversible Hydrides},\nauthor={HyMARC},\nyear={2019}" + - |- + @online{hymarcReversibleHydrides, + title={Hydrogen Storage 
Materials Database Reversible Hydrides}, + author={HyMARC}, + year={2019} + } +templates: + - |- + {#Task: Predict the theoretical hydrogen storage capacity of the following material.|Task: Estimate the theoretical hydrogen storage capacity of the following material.|Task: Calculate the theoretical hydrogen storage capacity of the following material.|Task: Determine the theoretical hydrogen storage capacity of the following material.|Task: Find the theoretical hydrogen storage capacity of the following material.|Task: Compute the theoretical hydrogen storage capacity of the following material.!} + {#Material: |Compound: |Hydride: |!}{material_name#} + {#Chemical formula: |Formula: |!}{chemical_formula#} + {#Theoretical hydrogen storage capacity: |Theoretical capacity: |!}{h_weight_density_theory#}{h_weight_density_theory__units}. + - |- + Question: What is the theoretical hydrogen storage capacity of the following material? + Description: {material_name__description} {material_name#} + Constraint: Return only the theoretical hydrogen storage capacity without any additional information. + Answer: {h_weight_density_theory#}{h_weight_density_theory__units}. + - |- + The theoretical hydrogen storage capacity of {material_name#} is {h_weight_density_theory#}{h_weight_density_theory__units}. + - |- + User: I need to know the theoretical hydrogen storage capacity of {material_name#}. + Assistant: The theoretical hydrogen storage capacity is {h_weight_density_theory#}{h_weight_density_theory__units}. + - |- + Task: Please answer the multiple choice question. + Question: Which materials have a theoretical hydrogen storage capacity of {h_weight_density_theory#}{h_weight_density_theory__units}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
+ Options: + {material_name%} + Answer: {%multiple_choice_result} + - |- + Task: Describe the synthetic procedure for the following material: {material_name#} + Answer: {synthesis_information#} diff --git a/data/tabular/h2_storage_materials/processing.ipynb b/data/tabular/h2_storage_materials/processing.ipynb index 3095b0d51..2db4ed310 100644 --- a/data/tabular/h2_storage_materials/processing.ipynb +++ b/data/tabular/h2_storage_materials/processing.ipynb @@ -345,7 +345,7 @@ "outputs": [], "source": [ "with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)" + " #yaml.dump(meta, f, sort_keys=False)" ] }, { @@ -523,7 +523,7 @@ "\n", " fn_meta = \"meta.yaml\"\n", " with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)\n", + " #yaml.dump(meta, f, sort_keys=False)\n", "\n", " print(f\"Finished processing {meta['name']} dataset!\")\n", "\n", diff --git a/data/tabular/h2_storage_materials/transform.py b/data/tabular/h2_storage_materials/transform.py index 249f30f97..8d7998b34 100644 --- a/data/tabular/h2_storage_materials/transform.py +++ b/data/tabular/h2_storage_materials/transform.py @@ -70,84 +70,10 @@ def get_and_transform_data(): df[string_columns] = df[string_columns].apply(lambda x: x.str.strip()) fn_data_csv = "data_clean.csv" + # dropna + df = df.dropna() df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "h2_storage_reversible_hydrides", # unique identifier, we will also use this for directory names - "description": "synthetic procedures, experimental and theoretical h2 capacities of hydrides", - "targets": [ - { - "id": "h_weight_density_theory", # name of the column in a tabular dataset - "description": "theoretical hydrogen storage capacity", # description of what this column means - "units": "wt%", # units of the values in this column (leave empty if unitless) - "type": "continuous", - "names": [ # names for the property (to sample from for building the prompts) - {"noun": "theoretical 
hydrogen storage weight density"}, - ], - }, - { - "id": "h_weight_density_experiment", # name of the column in a tabular dataset - "description": "experimental hydrogen storage capacity", # description of what this column means - "units": "wt%", # units of the values in this column (leave empty if unitless) - "type": "continuous", - "names": [ # names for the property (to sample from for building the prompts) - {"noun": "experimental hydrogen storage capacity"} - ], - }, - ], - "identifiers": [ - { - "id": "material_name", # column name - "type": "IUPAC", # can be "SMILES", "SELFIES", "IUPAC", "OTHER" - "description": "chemical name", # description (optional, except for "OTHER") - }, - { - "id": "chemical_formula", - "type": "Other", - "names": [{"noun": "chemical formula"}], - "description": "chemical formula", - }, - { - "id": "synthetic_information", # name of the column in a tabular dataset - "names": [{"noun": "synthesis procedure summary"}], - "description": "brief description of synthetic procedure", # description of what this column means - "type": "Other", - }, - ], - "license": "File", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": ( - "https://datahub.hymarc.org/dataset/" - "hydrogen-storage-materials-db/resource/4ef1c494-366e-43a3-bed4-a3985de5c374" - ), - "description": "website with source data", - }, - { - "url": ( - "https://datahub.hymarc.org/dataset/" - "ad580d95-e7e2-4ef4-a7f6-3b2f91a96eba/resource/" - "4ef1c494-366e-43a3-bed4-a3985de5c374/download/hydstormatdb-reversible_hydrides.csv" - ), - "description": "original_dataset", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@online{hymarcReversibleHydrides, -title={Hydrogen Storage Materials Database Reversible Hydrides}, -author={HyMARC}, -year={2019}""", - ], - } - - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/half_life_obach/meta.yaml b/data/tabular/half_life_obach/meta.yaml index 161da87c4..68e8e9e40 100644 --- a/data/tabular/half_life_obach/meta.yaml +++ b/data/tabular/half_life_obach/meta.yaml @@ -4,51 +4,130 @@ description: |- in the body to be reduced by half. It measures the duration of actions of a drug. This dataset deposited version under CHEMBL assay 1614674. 
targets: - - id: half_life_duration - description: the time it takes for the plasma concentration of a drug in the body to be reduced by half - units: hours - type: continuous - significant_digits: 2 - names: - - noun: half life in humans after IV administration - - noun: half life time in humans after IV administration - - noun: drug half life time in humans after IV administration - uris: - - http://purl.bioontology.org/ontology/MESH/D006207 +- id: half_life_duration + description: the time it takes for the plasma concentration of a drug in the body + to be reduced by half + units: hours + type: continuous + names: + - noun: half life in humans after IV administration + - noun: half life time in humans after IV administration + - noun: drug half life time in humans after IV administration + uris: + - http://purl.bioontology.org/ontology/MESH/D006207 benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: SMILES - type: SMILES - description: SMILES - - id: chembl_id - type: Other - names: - - noun: ChEMBL database id - - noun: ChEMBL identifier number - description: ChEMBL ids - sample: false +- id: SMILES + type: SMILES + description: SMILES +- id: chembl_id + type: Other + names: + - noun: ChEMBL database id + - noun: ChEMBL identifier number + description: ChEMBL ids + sample: false license: CC BY 4.0 links: - - url: https://doi.org/10.1124/dmd.108.020479 - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/adme/#half-life-obach-et-al - description: data source +- url: https://doi.org/10.1124/dmd.108.020479 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/adme/#half-life-obach-et-al + description: data source num_points: 667 bibtex: +- |- + @article{Obach2008, + doi = {10.1124/dmd.108.020479}, + url = {https://doi.org/10.1124/dmd.108.020479}, + year = {2008}, + month = apr, + 
publisher = {American Society for Pharmacology and Experimental Therapeutics (ASPET)}, + volume = {36}, + number = {7}, + pages = {1385--1405}, + author = {R. Scott Obach and Franco Lombardo and Nigel J. Waters}, + title = {Trend Analysis of a Database of Intravenous Pharmacokinetic + Parameters in Humans for 670 Drug Compounds}, + journal = {Drug Metabolism and Disposition} +templates: + - The molecule with the {SMILES__description} {#representation of |!}{SMILES#} has a {half_life_duration__names__noun} of {half_life_duration#} {half_life_duration__units}. + - Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule has a {half_life_duration__names__noun} of {half_life_duration#} {half_life_duration__units}. + - The {SMILES__description} {SMILES#} {#represents|is from!} a molecule with a {half_life_duration__names__noun} of {half_life_duration#} {half_life_duration__units}. + - The {#molecule |!}{SMILES__description} {SMILES#} has a {half_life_duration__names__noun} of {half_life_duration#} {half_life_duration__units}. - |- - @article{Obach2008, - doi = {10.1124/dmd.108.020479}, - url = {https://doi.org/10.1124/dmd.108.020479}, - year = {2008}, - month = apr, - publisher = {American Society for Pharmacology and Experimental Therapeutics (ASPET)}, - volume = {36}, - number = {7}, - pages = {1385--1405}, - author = {R. Scott Obach and Franco Lombardo and Nigel J. Waters}, - title = {Trend Analysis of a Database of Intravenous Pharmacokinetic - Parameters in Humans for 670 Drug Compounds}, - journal = {Drug Metabolism and Disposition} + Task: Please predict a property for a molecule based on the description. + Description: Predict the {half_life_duration__names__noun}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: You must provide a numerical estimate in units of {half_life_duration__units}. + Result: {half_life_duration#} + - |- + Task: Please predict a property for a molecule based on the description. 
+ Description: Predict the {half_life_duration__names__noun}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Answer the question in a {#full|complete!} sentence. + Result: This molecule has a {half_life_duration__names__noun} of {half_life_duration#} {half_life_duration__units}. + - |- + Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. + Description: A molecule with a {half_life_duration__names__noun} of approximately {half_life_duration#} {half_life_duration__units}. + Result: {SMILES#} + - |- + User: Can you {#tell me|derive|estimate!} the {half_life_duration__names__noun} for the molecule with the {SMILES__description} {SMILES#}? + Assistant: Based on my analysis, the {half_life_duration__names__noun} for this molecule is approximately {half_life_duration#} {half_life_duration__units}. + - |- + User: What is the {half_life_duration__names__noun} value for the molecule with the {SMILES__description} {SMILES#}? + Assistant: The {half_life_duration__names__noun} value for this molecule is approximately {half_life_duration#} {half_life_duration__units}. + - |- + User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that has a {half_life_duration__names__noun} value of approximately {half_life_duration#} {half_life_duration__units}? + Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#} + - |- + User: I'm {#searching|looking!} for the {SMILES__description} of a molecule with a {half_life_duration__names__noun} value close to {half_life_duration#} {half_life_duration__units}. + Assistant: This molecule has a {half_life_duration__names__noun} value of approximately {half_life_duration#} {half_life_duration__units}: {SMILES#} + - |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. 
!}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? + User: Yes, please. The molecule should have a {half_life_duration__names__noun} value of approximately {half_life_duration#} {half_life_duration__units}. + Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} has a {half_life_duration__names__noun} value of approximately {half_life_duration#} {half_life_duration__units}: {SMILES#} + - |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? + User: Yes, the molecule should have a {half_life_duration__names__noun} value close to {half_life_duration#} {half_life_duration__units}. + Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} has a {half_life_duration__names__noun} value of approximately {half_life_duration#} {half_life_duration__units}: {SMILES#} + - What is the {half_life_duration__names__noun} value for the {SMILES__description} {SMILES#}:{half_life_duration#} + - |- + Task: Please predict a property for a molecule based on the description. + Description: Predict the {half_life_duration__names__noun}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: You must provide a numerical estimate in units of {half_life_duration__units}. + Result: {half_life_duration#} + - |- + Task: Please answer the multiple choice question. + Question: Which molecule has the longest {half_life_duration__names__noun}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%half_life_duration%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Which molecule has the longest {half_life_duration__names__noun}? 
+ Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%half_life_duration%} + Answer: {%multiple_choice_result} + + - |- + Task: Please answer the multiple choice question. + Question: Rank these molecules from shortest to longest {half_life_duration__names__noun}. + Constraint: You must select all options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%half_life_duration%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Rank these molecules from shortest to longest {half_life_duration__names__noun}. + Constraint: You must select all options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%half_life_duration%} + Answer: {%multiple_choice_result} diff --git a/data/tabular/half_life_obach/transform.py b/data/tabular/half_life_obach/transform.py index 7b2d98e5d..6eff4c63d 100644 --- a/data/tabular/half_life_obach/transform.py +++ b/data/tabular/half_life_obach/transform.py @@ -42,100 +42,6 @@ def get_and_transform_data(): # save to csv fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - meta = { - "name": "half_life_obach", # unique identifier, we will also use this for directory names - "description": """Half life of a drug is the duration for the concentration of the drug -in the body to be reduced by half. It measures the duration of actions of a drug. 
-This dataset deposited version under CHEMBL assay 1614674.""", - "targets": [ - { - "id": "half_life_duration", # name of the column in a tabular dataset - "description": "the time it takes for the plasma concentration of a drug in the body to be reduced by half", # noqa: E501 - "units": "hours", # units of the values in this column (leave empty if unitless) - "type": "continuous", - "names": [ # names for the property (to sample from for building the prompts) - {"noun": "half life in humans after IV administration"}, - {"noun": "half life time in humans after IV administration"}, - {"noun": "drug half life time in humans after IV administration"}, - # { - # "noun": "the duration by which the concentration of the drug in the body is reduced by half" - # }, - ], - "uris": [ - "http://purl.bioontology.org/ontology/MESH/D006207", - ], - }, - ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - { - "id": "chembl_id", # column name - "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "names": [ - {"noun": "ChEMBL database id"}, - {"noun": "ChEMBL identifier number"}, - ], - "description": "ChEMBL ids", # description (optional, except for "Other") - "sample": False, - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.1124/dmd.108.020479", - "description": "corresponding publication", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/adme/#half-life-obach-et-al", - "description": "data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Obach2008, -doi = {10.1124/dmd.108.020479}, -url = {https://doi.org/10.1124/dmd.108.020479}, -year = {2008}, -month = apr, -publisher = {American Society for Pharmacology and Experimental Therapeutics (ASPET)}, -volume = {36}, -number = {7}, -pages = {1385--1405}, -author = {R. Scott Obach and Franco Lombardo and Nigel J. Waters}, -title = {Trend Analysis of a Database of Intravenous Pharmacokinetic -Parameters in Humans for 670 Drug Compounds}, -journal = {Drug Metabolism and Disposition}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") if __name__ == "__main__": diff --git a/data/tabular/herg_blockers/meta.yaml b/data/tabular/herg_blockers/meta.yaml index 3d998c759..a32ebc678 100644 --- a/data/tabular/herg_blockers/meta.yaml +++ b/data/tabular/herg_blockers/meta.yaml @@ -98,26 +98,27 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {herg_blocker#not &NULL}be a {herg_blocker__names__noun}. 
Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {herg_blocker#not &NULL}a {herg_blocker__names__noun}: {SMILES#} - - Is the {SMILES__description} {SMILES#} a {herg_blocker__names__noun}:{herg_blocker#no&yes} + - Is the {SMILES__description} {SMILES#} a {herg_blocker__names__noun}:{herg_blocker#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is a {herg_blocker__names__noun}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{herg_blocker#False&True} + Result: {herg_blocker#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is a {herg_blocker__names__noun}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {herg_blocker#not &NULL}a {herg_blocker__names__noun}. + Result:This molecule is {herg_blocker#not &NULL}a {herg_blocker__names__noun}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} a {herg_blocker__names__noun}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {herg_blocker%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {herg_blocker#not &NULL}a {herg_blocker__names__noun}? @@ -131,4 +132,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%herg_blocker%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/herg_blockers/transform.py b/data/tabular/herg_blockers/transform.py index 9967270ea..58d5b0a19 100644 --- a/data/tabular/herg_blockers/transform.py +++ b/data/tabular/herg_blockers/transform.py @@ -224,8 +224,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/herg_central_at_10uM/meta.yaml b/data/tabular/herg_central_at_10uM/meta.yaml index 3a72f4ad6..5f9ff223b 100644 --- a/data/tabular/herg_central_at_10uM/meta.yaml +++ b/data/tabular/herg_central_at_10uM/meta.yaml @@ -28,4 +28,37 @@ links: description: Data source num_points: 306893 bibtex: - "@article{Du2011,\ndoi = {10.1089/adt.2011.0425},\nurl = {https://doi.org/10.1089/adt.2011.0425},\nyear = {2011},\nmonth = dec,\npublisher = {Mary Ann Liebert Inc},\nvolume = {9},\nnumber = {6},\npages = {580--588},\nauthor = {Fang Du and Haibo Yu and Beiyan Zou and Joseph Babcock\nand Shunyou Long and Min Li},\ntitle = {hERGCentral: A Large Database to Store, Retrieve, and Analyze Compound Human\nEther-à-go-go Related Gene Channel Interactions to Facilitate Cardiotoxicity Assessment in Drug Development},\njournal = {ASSAY and Drug Development Technologies}" + - |- + @article{Du2011, + doi = {10.1089/adt.2011.0425}, + url = {https://doi.org/10.1089/adt.2011.0425}, + year = {2011}, + month = dec, + publisher = {Mary Ann Liebert Inc}, + volume = {9}, + number = {6}, + pages = {580--588}, + author = {Fang Du and Haibo Yu and Beiyan Zou and Joseph Babcock and Shunyou Long and Min Li}, + title = {hERGCentral: A Large Database to Store, Retrieve, and Analyze Compound Human Ether-à-go-go Related Gene Channel Interactions to 
Facilitate Cardiotoxicity Assessment in Drug Development}, + journal = {ASSAY and Drug Development Technologies} + } +templates: + - The {herg_central_at_10uM__names__noun} of a {#drug|compound!} with the {SMILES__description} {SMILES#} is {herg_central_at_10uM#}{herg_central_at_10uM__units}. + - A {#drug|compound!} with the {SMILES__description} {SMILES#} has a {herg_central_at_10uM__names__noun} of {herg_central_at_10uM#}{herg_central_at_10uM__units}. + - |- + {#Task|Problem statement!}: Answer the {#multiple choice|multiple-choice|MCQ!} question. + {#Question|Query!}: Which {#compound|drug!} {#has|exhibits!} a {herg_central_at_10uM#}{herg_central_at_10uM__units} {herg_central_at_10uM__names__noun}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%herg_central_at_10uM%} + Answer: {%multiple_choice_result}. {SMILES#} + - |- + User: {#I need|I want!} to know the {herg_central_at_10uM__names__noun} of a {#compound|drug!} with the {SMILES__description} {SMILES#}. + Assistant: The {herg_central_at_10uM__names__noun} is {herg_central_at_10uM#}{herg_central_at_10uM__units}. + - |- + {#Task|Problem statement!}: Answer the {#multiple choice|multiple-choice|MCQ!} question. + {#Question|Query!}: What is the {herg_central_at_10uM__names__noun} of a {#compound|drug!} with the {SMILES__description} {SMILES#}? + Constraint: You must return none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {herg_central_at_10uM%SMILES%} + Answer: {%multiple_choice_result}. {herg_central_at_10uM#}{herg_central_at_10uM__units}. 
diff --git a/data/tabular/herg_central_at_10uM/transform.py b/data/tabular/herg_central_at_10uM/transform.py index d2870a332..b20d23e64 100644 --- a/data/tabular/herg_central_at_10uM/transform.py +++ b/data/tabular/herg_central_at_10uM/transform.py @@ -47,99 +47,5 @@ def get_and_transform_data(): fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "herg_central_at_10uM", - "description": """Human ether-à-go-go related gene (hERG) is crucial for the coordination -of the heart's beating. Thus, if a drug blocks the hERG, it could lead to severe -adverse effects. Therefore, reliable prediction of hERG liability in the early -stages of drug design is quite important to reduce the risk of cardiotoxicity-related -attritions in the later development stages. There are three targets: hERG_at_1microM, -hERG_at_10microM, and herg_inhib.""", - "targets": [ - { - "id": "herg_central_at_10uM", - "description": "the percent inhibition of hERG at a 10uM concentration", - "units": "%", - "type": "continuous", - "names": [ - {"noun": "hERG inhibition at a concentration of 10uM"}, - {"noun": "hERG inhibition at a concentration of 10uM"}, - {"noun": "hERG inhibition at 10uM"}, - { - "noun": "human ether-à-go-go related gene (hERG) inhibition at a concentration of 10uM" - }, - { - "noun": "human ether-à-go-go related gene (hERG) inhibition at 10uM" - }, - { - "noun": "human ether-à-go-go related gene (hERG) inhibition at 10uM" - }, - ], - "uris": [ - "http://purl.obolibrary.org/obo/MI_2136", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.1089/adt.2011.0425", - "description": "corresponding publication", - }, - { - "url": "https://bbirnbaum.com/", - "description": "TDC Contributor", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/tox/#herg-central", - "description": "Data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Du2011, -doi = {10.1089/adt.2011.0425}, -url = {https://doi.org/10.1089/adt.2011.0425}, -year = {2011}, -month = dec, -publisher = {Mary Ann Liebert Inc}, -volume = {9}, -number = {6}, -pages = {580--588}, -author = {Fang Du and Haibo Yu and Beiyan Zou and Joseph Babcock -and Shunyou Long and Min Li}, -title = {hERGCentral: A Large Database to Store, Retrieve, and Analyze Compound Human -Ether-à-go-go Related Gene Channel Interactions to Facilitate Cardiotoxicity Assessment in Drug Development}, -journal = {ASSAY and Drug Development Technologies}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/herg_central_at_1uM/meta.yaml b/data/tabular/herg_central_at_1uM/meta.yaml index 2817fd315..8934c0a67 100644 --- a/data/tabular/herg_central_at_1uM/meta.yaml +++ b/data/tabular/herg_central_at_1uM/meta.yaml @@ -3,7 +3,7 @@ description: "Human ether-à-go-go 
related gene (hERG) is crucial for the coordi targets: - id: herg_central_at_1uM description: the percent inhibition of hERG at a 1uM concentration - units: "%" + units: \% type: continuous names: - noun: hERG inhibition at a concentration of 1uM @@ -28,4 +28,37 @@ links: description: Data source num_points: 306893 bibtex: - - "@article{Du2011,\ndoi = {10.1089/adt.2011.0425},\nurl = {https://doi.org/10.1089/adt.2011.0425},\nyear = {2011},\nmonth = dec,\npublisher = {Mary Ann Liebert Inc},\nvolume = {9},\nnumber = {6},\npages = {580--588},\nauthor = {Fang Du and Haibo Yu and Beiyan Zou and Joseph Babcock\nand Shunyou Long and Min Li},\ntitle = {hERGCentral: A Large Database to Store, Retrieve, and Analyze Compound Human\nEther-à-go-go Related Gene Channel Interactions to Facilitate Cardiotoxicity Assessment in Drug Development},\njournal = {ASSAY and Drug Development Technologies}" + - |- + @article{Du2011, + doi = {10.1089/adt.2011.0425}, + url = {https://doi.org/10.1089/adt.2011.0425}, + year = {2011}, + month = dec, + publisher = {Mary Ann Liebert Inc}, + volume = {9}, + number = {6}, + pages = {580--588}, + author = {Fang Du and Haibo Yu and Beiyan Zou and Joseph Babcock\nand Shunyou Long and Min Li}, + title = {hERGCentral: A Large Database to Store, Retrieve, and Analyze Compound Human\nEther-à-go-go Related Gene Channel Interactions to Facilitate Cardiotoxicity Assessment in Drug Development}, + journal = {ASSAY and Drug Development Technologies} + } +templates: + - The {herg_central_at_1uM__names__noun} of a {#drug|compound!} with the {SMILES__description} {SMILES#} is {herg_central_at_1uM#}{herg_central_at_1uM__units}. + - A {#drug|compound!} with the {SMILES__description} {SMILES#} has a {herg_central_at_1uM__names__noun} of {herg_central_at_1uM#}{herg_central_at_1uM__units}. + - |- + {#Task|Problem statement!}: Answer the {#multiple choice|multiple-choice|MCQ!} question. 
+ {#Question|Query!}: Which {#compound|drug!} {#has|exhibits!} a {herg_central_at_1uM#}{herg_central_at_1uM__units} {herg_central_at_1uM__names__noun}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%herg_central_at_1uM%} + Answer: {%multiple_choice_result}. {SMILES#} + - |- + User: {#I need|I want!} to know the {herg_central_at_1uM__names__noun} of a {#compound|drug!} with the {SMILES__description} {SMILES#}. + Assistant: The {herg_central_at_1uM__names__noun} is {herg_central_at_1uM#}{herg_central_at_1uM__units}. + - |- + {#Task|Problem statement!}: Answer the {#multiple choice|multiple-choice|MCQ!} question. + {#Question|Query!}: What is the {herg_central_at_1uM__names__noun} of a {#compound|drug!} with the {SMILES__description} {SMILES#}? + Constraint: You must return none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {herg_central_at_1uM%SMILES%} + Answer: {%multiple_choice_result}. {herg_central_at_1uM#}{herg_central_at_1uM__units}. diff --git a/data/tabular/herg_central_at_1uM/transform.py b/data/tabular/herg_central_at_1uM/transform.py index 7fc731cc5..8f3b1570b 100644 --- a/data/tabular/herg_central_at_1uM/transform.py +++ b/data/tabular/herg_central_at_1uM/transform.py @@ -47,99 +47,5 @@ def get_and_transform_data(): fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "herg_central_at_1uM", - "description": """Human ether-à-go-go related gene (hERG) is crucial for the coordination -of the heart's beating. Thus, if a drug blocks the hERG, it could lead to severe -adverse effects. Therefore, reliable prediction of hERG liability in the early -stages of drug design is quite important to reduce the risk of cardiotoxicity-related -attritions in the later development stages. 
There are three targets: hERG_at_1microM, -hERG_at_10microM, and herg_inhib.""", - "targets": [ - { - "id": "herg_central_at_1uM", - "description": "the percent inhibition of hERG at a 1uM concentration", - "units": "%", - "type": "continuous", - "names": [ - {"noun": "hERG inhibition at a concentration of 1uM"}, - {"noun": "hERG inhibition at a concentration of 1uM"}, - {"noun": "hERG inhibition at 1uM"}, - { - "noun": "human ether-à-go-go related gene (hERG) inhibition at a concentration of 1uM" - }, - { - "noun": "human ether-à-go-go related gene (hERG) inhibition at 1uM" - }, - { - "noun": "human ether-à-go-go related gene (hERG) inhibition at 1uM" - }, - ], - "uris": [ - "http://purl.obolibrary.org/obo/MI_2136", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.1089/adt.2011.0425", - "description": "corresponding publication", - }, - { - "url": "https://bbirnbaum.com/", - "description": "TDC Contributer", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/tox/#herg-central", - "description": "Data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Du2011, -doi = {10.1089/adt.2011.0425}, -url = {https://doi.org/10.1089/adt.2011.0425}, -year = {2011}, -month = dec, -publisher = {Mary Ann Liebert Inc}, -volume = {9}, -number = {6}, -pages = {580--588}, -author = {Fang Du and Haibo Yu and Beiyan Zou and Joseph Babcock -and Shunyou Long and Min Li}, -title = {hERGCentral: A Large Database to Store, Retrieve, and Analyze Compound Human -Ether-à-go-go Related Gene Channel Interactions to Facilitate Cardiotoxicity Assessment in Drug Development}, -journal = {ASSAY and Drug Development Technologies}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/herg_central_inhib/meta.yaml b/data/tabular/herg_central_inhib/meta.yaml index 8c4514045..38f6fce0e 100644 --- a/data/tabular/herg_central_inhib/meta.yaml +++ b/data/tabular/herg_central_inhib/meta.yaml @@ -79,26 +79,27 @@ templates: Assistant: {#This sounds 
very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {herg_inhib#not &NULL}be a {herg_inhib__names__noun}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {herg_inhib#not &NULL}a {herg_inhib__names__noun}: {SMILES#} - - Is the {SMILES__description} {SMILES#} a {herg_inhib__names__noun}:{herg_inhib#no&yes} + - Is the {SMILES__description} {SMILES#} a {herg_inhib__names__noun}:{herg_inhib#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is a {herg_inhib__names__noun}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{herg_inhib#False&True} + Result: {herg_inhib#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is a {herg_inhib__names__noun}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {herg_inhib#not &NULL}a {herg_inhib__names__noun}. + Result:This molecule is {herg_inhib#not &NULL}a {herg_inhib__names__noun}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} a {herg_inhib__names__noun}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {herg_inhib%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {herg_inhib#not &NULL}a {herg_inhib__names__noun}? @@ -112,4 +113,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%herg_inhib%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/herg_central_inhib/transform.py b/data/tabular/herg_central_inhib/transform.py index f7c0a4305..7cc98762d 100644 --- a/data/tabular/herg_central_inhib/transform.py +++ b/data/tabular/herg_central_inhib/transform.py @@ -205,8 +205,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/herg_karim_et_al/meta.yaml b/data/tabular/herg_karim_et_al/meta.yaml index 0220393be..036ab93bf 100644 --- a/data/tabular/herg_karim_et_al/meta.yaml +++ b/data/tabular/herg_karim_et_al/meta.yaml @@ -86,26 +86,27 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {herg_blocker#not &NULL}be a {herg_blocker__names__noun}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {herg_blocker#not &NULL}a {herg_blocker__names__noun}: {SMILES#} - - Is the {SMILES__description} {SMILES#} a {herg_blocker__names__noun}:{herg_blocker#no&yes} + - Is the {SMILES__description} {SMILES#} a {herg_blocker__names__noun}:{herg_blocker#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is a {herg_blocker__names__noun}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{herg_blocker#False&True} + Result: {herg_blocker#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is a {herg_blocker__names__noun}. 
{#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {herg_blocker#not &NULL}a {herg_blocker__names__noun}. + Result:This molecule is {herg_blocker#not &NULL}a {herg_blocker__names__noun}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} a {herg_blocker__names__noun}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {herg_blocker%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {herg_blocker#not &NULL}a {herg_blocker__names__noun}? @@ -119,4 +120,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%herg_blocker%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/herg_karim_et_al/transform.py b/data/tabular/herg_karim_et_al/transform.py index 62aa05929..c091276d9 100644 --- a/data/tabular/herg_karim_et_al/transform.py +++ b/data/tabular/herg_karim_et_al/transform.py @@ -200,8 +200,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/hiv/meta.yaml b/data/tabular/hiv/meta.yaml index a94119ea7..e5a8f9f62 100644 --- a/data/tabular/hiv/meta.yaml +++ b/data/tabular/hiv/meta.yaml @@ -87,26 +87,27 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? 
User: Yes, the molecule should {activity_HIV#not &NULL}be {activity_HIV__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_HIV#not &NULL}{activity_HIV__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {activity_HIV__names__adjective}:{activity_HIV#no&yes} + - Is the {SMILES__description} {SMILES#} {activity_HIV__names__adjective}:{activity_HIV#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_HIV__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{activity_HIV#False&True} + Result: {activity_HIV#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_HIV__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {activity_HIV#not &NULL}{activity_HIV__names__adjective}. + Result:This molecule is {activity_HIV#not &NULL}{activity_HIV__names__adjective}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_HIV__names__adjective}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {activity_HIV%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {activity_HIV#not &NULL}{activity_HIV__names__adjective}? @@ -120,4 +121,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%activity_HIV%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/hiv/transform.py b/data/tabular/hiv/transform.py index e6f32a7db..5aedc69e3 100644 --- a/data/tabular/hiv/transform.py +++ b/data/tabular/hiv/transform.py @@ -202,8 +202,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/human_intestinal_absorption/meta.yaml b/data/tabular/human_intestinal_absorption/meta.yaml index 99e2a9234..48419528a 100644 --- a/data/tabular/human_intestinal_absorption/meta.yaml +++ b/data/tabular/human_intestinal_absorption/meta.yaml @@ -96,13 +96,13 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {absorption_HIA_Hou#not &NULL}be {absorption_HIA_Hou__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {absorption_HIA_Hou#not &NULL}{absorption_HIA_Hou__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {absorption_HIA_Hou__names__adjective}:{absorption_HIA_Hou#no&yes} + - Is the {SMILES__description} {SMILES#} {absorption_HIA_Hou__names__adjective}:{absorption_HIA_Hou#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {absorption_HIA_Hou__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{absorption_HIA_Hou#False&True} + Result: {absorption_HIA_Hou#False&True} - |- Task: Please answer the multiple choice question. 
Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {absorption_HIA_Hou__names__adjective}? @@ -116,7 +116,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {absorption_HIA_Hou%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {absorption_HIA_Hou#not &NULL}{absorption_HIA_Hou__names__adjective}? @@ -130,4 +131,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%absorption_HIA_Hou%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/human_intestinal_absorption/transform.py b/data/tabular/human_intestinal_absorption/transform.py index 8d363cc44..82773e98a 100644 --- a/data/tabular/human_intestinal_absorption/transform.py +++ b/data/tabular/human_intestinal_absorption/transform.py @@ -212,8 +212,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/iupac_goldbook/meta.yaml b/data/tabular/iupac_goldbook/meta.yaml index 423db0011..9ad053804 100644 --- a/data/tabular/iupac_goldbook/meta.yaml +++ b/data/tabular/iupac_goldbook/meta.yaml @@ -87,8 +87,8 @@ templates: Task: Please {#give me|create|generate!} a {definition__names__noun} of a {term__names__noun}. Term: {term#} Constraint: Answer the question with {#full|complete!} sentences. 
- Result:{definition#} + Result: {definition#} - |- Task: Please {#give me|create|generate!} a {term__names__noun} for the {#following |!}{definition__names__noun}: Definition: {definition#} - Result:{term#} + Result: {term#} diff --git a/data/tabular/iupac_goldbook/transform.py b/data/tabular/iupac_goldbook/transform.py index 99b94fef7..92e1992d1 100644 --- a/data/tabular/iupac_goldbook/transform.py +++ b/data/tabular/iupac_goldbook/transform.py @@ -189,8 +189,8 @@ def str_presenter(dumper, data): ) # to use with safe_dump fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/kcnq2_potassium_channel_butkiewicz/meta.yaml b/data/tabular/kcnq2_potassium_channel_butkiewicz/meta.yaml index d5e61b49a..1de59f71e 100644 --- a/data/tabular/kcnq2_potassium_channel_butkiewicz/meta.yaml +++ b/data/tabular/kcnq2_potassium_channel_butkiewicz/meta.yaml @@ -133,26 +133,27 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {activity_kcnq2_potassium_channel#not &NULL}be {activity_kcnq2_potassium_channel__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_kcnq2_potassium_channel#not &NULL}{activity_kcnq2_potassium_channel__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {activity_kcnq2_potassium_channel__names__gerund}:{activity_kcnq2_potassium_channel#no&yes} + - Is the {SMILES__description} {SMILES#} {activity_kcnq2_potassium_channel__names__gerund}:{activity_kcnq2_potassium_channel#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_kcnq2_potassium_channel__names__gerund}. 
{#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{activity_kcnq2_potassium_channel#False&True} + Result: {activity_kcnq2_potassium_channel#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_kcnq2_potassium_channel__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {activity_kcnq2_potassium_channel#not &NULL}{activity_kcnq2_potassium_channel__names__gerund}. + Result:This molecule is {activity_kcnq2_potassium_channel#not &NULL}{activity_kcnq2_potassium_channel__names__gerund}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_kcnq2_potassium_channel__names__gerund}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {activity_kcnq2_potassium_channel%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {activity_kcnq2_potassium_channel#not &NULL}{activity_kcnq2_potassium_channel__names__gerund}? @@ -166,4 +167,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%activity_kcnq2_potassium_channel%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/kcnq2_potassium_channel_butkiewicz/transform.py b/data/tabular/kcnq2_potassium_channel_butkiewicz/transform.py index e3ecfc95d..fd85e8e0a 100644 --- a/data/tabular/kcnq2_potassium_channel_butkiewicz/transform.py +++ b/data/tabular/kcnq2_potassium_channel_butkiewicz/transform.py @@ -231,8 +231,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/ld50_zhu/example_processing_and_templates.ipynb b/data/tabular/ld50_zhu/example_processing_and_templates.ipynb index c2a09dcac..de17e8360 100644 --- a/data/tabular/ld50_zhu/example_processing_and_templates.ipynb +++ b/data/tabular/ld50_zhu/example_processing_and_templates.ipynb @@ -934,7 +934,7 @@ "outputs": [], "source": [ "with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)" + " #yaml.dump(meta, f, sort_keys=False)" ] }, { @@ -1171,7 +1171,7 @@ " ) # to use with safe_dum\n", " fn_meta = \"meta.yaml\"\n", " with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)\n", + " #yaml.dump(meta, f, sort_keys=False)\n", "\n", " print(f\"Finished processing {meta['name']} dataset!\")\n", "\n", diff --git a/data/tabular/ld50_zhu/meta.yaml b/data/tabular/ld50_zhu/meta.yaml index 7f77efa10..2120cea55 100644 --- a/data/tabular/ld50_zhu/meta.yaml +++ b/data/tabular/ld50_zhu/meta.yaml @@ -51,3 +51,82 @@ bibtex: title = {Quantitative Structure-Activity Relationship Modeling of Rat Acute Toxicity by Oral Exposure}, journal = {Chemical Research in Toxicology}} +templates: + - The molecule with the {SMILES__description} {#representation of |!}{SMILES#} has an 
{acute_toxicity__names__noun} value of {acute_toxicity#}. + - Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule has an {acute_toxicity__names__noun} value of {acute_toxicity#}. + - The {SMILES__description} {SMILES#} {#represents|is from!} a molecule with an {acute_toxicity__names__noun} of {acute_toxicity#}. + - The {#molecule |!}{SMILES__description} {SMILES#} has an {acute_toxicity__names__noun} of {acute_toxicity#}. + - |- + Task: Please predict a property for a molecule based on the description. + Description: Predict the {acute_toxicity__names__noun}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: You must provide a numerical estimate in units of {acute_toxicity__units}. + Result: {acute_toxicity#} + - |- + Task: Please predict a property for a molecule based on the description. + Description: Predict the {acute_toxicity__names__noun}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Answer the question in a {#full|complete!} sentence. + Result: This molecule has an {acute_toxicity__names__noun} value of {acute_toxicity#} {acute_toxicity__units}. + - |- + Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. + Description: A molecule with an {acute_toxicity__names__noun} of approximately {acute_toxicity#} {acute_toxicity__units}. + Result: {SMILES#} + - |- + User: Can you {#tell me|derive|estimate!} the {acute_toxicity__names__noun} for the molecule with the {SMILES__description} {SMILES#}? + Assistant: Based on my analysis, the {acute_toxicity__names__noun} for this molecule is approximately {acute_toxicity#} {acute_toxicity__units}. + - |- + User: What is the {acute_toxicity__names__noun} value for the molecule with the {SMILES__description} {SMILES#}? + Assistant: The {acute_toxicity__names__noun} value for this molecule is approximately {acute_toxicity#} {acute_toxicity__units}. 
+ - |- + User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that has an {acute_toxicity__names__noun} value of approximately {acute_toxicity#} {acute_toxicity__units}? + Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#} + - |- + User: I'm {#searching|looking!} for the {SMILES__description} of a molecule with an {acute_toxicity__names__noun} value close to {acute_toxicity#} {acute_toxicity__units}. + Assistant: This molecule has an {acute_toxicity__names__noun} value of approximately {acute_toxicity#} {acute_toxicity__units}: {SMILES#} + - |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? + User: Yes, please. The molecule should have an {acute_toxicity__names__noun} value of approximately {acute_toxicity#} {acute_toxicity__units}. + Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} has an {acute_toxicity__names__noun} value of approximately {acute_toxicity#} {acute_toxicity__units}: {SMILES#} + - |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? + User: Yes, the molecule should have an {acute_toxicity__names__noun} value close to {acute_toxicity#} {acute_toxicity__units}. + Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} has an {acute_toxicity__names__noun} value of approximately {acute_toxicity#} {acute_toxicity__units}: {SMILES#} + - |- + What is the {acute_toxicity__names__noun} value for the {SMILES__description} {SMILES#}: {acute_toxicity#} + - |- + Task: Please predict a property for a molecule based on the description. + Description: Predict the {acute_toxicity__names__noun}. 
+ {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: You must provide a numerical estimate in units of {acute_toxicity__units}. + Result: {acute_toxicity#} + - |- + Task: Please answer the multiple choice question. + Question: Which molecule has the highest {acute_toxicity__names__noun}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%acute_toxicity%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Which molecule has the highest {acute_toxicity__names__noun}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%acute_toxicity%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Rank these molecules from lowest to highest {acute_toxicity__names__noun}. + Constraint: You must select all options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {SMILES%acute_toxicity%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Rank these molecules from lowest to highest {acute_toxicity__names__noun}. + Constraint: You must select all options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
+ Options: + {SMILES%acute_toxicity%} + Answer: {%multiple_choice_result} \ No newline at end of file diff --git a/data/tabular/ld50_zhu/transform.py b/data/tabular/ld50_zhu/transform.py index e6a12a22b..be94231b2 100644 --- a/data/tabular/ld50_zhu/transform.py +++ b/data/tabular/ld50_zhu/transform.py @@ -44,96 +44,6 @@ def get_and_transform_data(): fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { - "name": "ld50_zhu", # unique identifier, we will also use this for directory names - "description": """Acute toxicity LD50 measures -the most conservative dose that can lead to lethal adverse effects. -The higher the dose, the more lethal of a drug.""", - "targets": [ - { - "id": "acute_toxicity", # name of the column in a tabular dataset - "description": "Acute Toxicity LD50.", # description of what this column means - "units": "log(1/(mol/kg))", # units of the values in this column (leave empty if unitless) - "type": "continuous", - "names": [ - {"noun": "acute toxicity rat LD50"}, - {"noun": "acute toxicity (LD50 in rats)"}, - {"noun": "LD50 in rats"}, - {"noun": "rat LD50"}, - ], - "uris": ["http://www.bioassayontology.org/bao#BAO_0002117"], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - { - "id": "compound_name", - "type": "Other", - "description": "compound name", - "names": [ - {"noun": "compound"}, - {"noun": "compound name"}, - {"noun": "drug"}, - ], - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.1021/tx900189p", - "description": "corresponding publication", - }, - ], - "benchmarks": [ - { - "name": "TDC", - "link": "https://tdcommons.ai/", - "split_column": "split", - } - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Zhu2009, -doi = {10.1021/tx900189p}, -url = {https://doi.org/10.1021/tx900189p}, -year = {2009}, -month = oct, -publisher = {American Chemical Society ({ACS})}, -volume = {22}, -number = {12}, -pages = {1913--1921}, -author = {Hao Zhu and Todd M. Martin and Lin Ye and Alexander -Sedykh and Douglas M. Young and Alexander Tropsha}, -title = {Quantitative Structure-Activity Relationship Modeling -of Rat Acute Toxicity by Oral Exposure}, -journal = {Chemical Research in Toxicology}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: - https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/lipophilicity/meta.yaml b/data/tabular/lipophilicity/meta.yaml index 66d21c546..c09fd7c46 100644 --- a/data/tabular/lipophilicity/meta.yaml +++ b/data/tabular/lipophilicity/meta.yaml @@ -57,41 +57,43 @@ templates: Question: What is the {exp__names__noun} of the {SMILES__description} {SMILES#}? 
Options: {exp%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Question: What is the {exp__names__noun} for the {#molecule|chemical|compound!} represented by the {SMILES__description} {SMILES#}? - Answer:{exp#} + Answer:{exp#} - |- Task: Determine the {exp__names__noun} for the given {SMILES__description}. Molecule: {SMILES#} - Answer:{exp#} + Answer:{exp#} - |- Task: Please {#estimate|guess|predict|provide!} the {exp__names__noun} for the following {SMILES__description}. Molecule: {SMILES#} - Answer:{exp#} + Answer:{exp#} - |- Question: What is the experimental {exp__names__noun} for the molecule with the {SMILES__description} {SMILES#}? - Answer:{exp#} + Answer:{exp#} - |- Task: Identify the {exp__names__noun} for the given {#molecule|chemical|compound!}. Molecule: {SMILES#} - Answer:{exp#} + Answer:{exp#} - |- Task: Please select the correct {exp__names__noun} for the {#molecule|chemical|compound!} represented by the {SMILES__description} {SMILES#}. {#Pick|Choose|Select!} {%multiple_choice_enum%3%aA1}. Options: {exp%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: {#Estimate|Guess|Predict|Provide!} the {exp__names__noun} for the {#molecule|chemical|compound!} with the {SMILES__description} {SMILES#}. 
- Answer:{exp#} + Answer:{exp#} diff --git a/data/tabular/lipophilicity/transform.py b/data/tabular/lipophilicity/transform.py index 1b47ca94c..98c282017 100644 --- a/data/tabular/lipophilicity/transform.py +++ b/data/tabular/lipophilicity/transform.py @@ -131,8 +131,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz/meta.yaml b/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz/meta.yaml index c222cdaa0..e47590c98 100644 --- a/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz/meta.yaml +++ b/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz/meta.yaml @@ -125,26 +125,27 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {m1_muscarinic_agonist#not &NULL}be {m1_muscarinic_agonist__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {m1_muscarinic_agonist#not &NULL}{m1_muscarinic_agonist__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {m1_muscarinic_agonist__names__gerund}:{m1_muscarinic_agonist#no&yes} + - Is the {SMILES__description} {SMILES#} {m1_muscarinic_agonist__names__gerund}:{m1_muscarinic_agonist#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {m1_muscarinic_agonist__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{m1_muscarinic_agonist#False&True} + Result: {m1_muscarinic_agonist#False&True} - |- Task: Please classify a molecule based on the description. 
Description: A molecule that is {m1_muscarinic_agonist__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {m1_muscarinic_agonist#not &NULL}{m1_muscarinic_agonist__names__gerund}. + Result:This molecule is {m1_muscarinic_agonist#not &NULL}{m1_muscarinic_agonist__names__gerund}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {m1_muscarinic_agonist__names__gerund}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {m1_muscarinic_agonist%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {m1_muscarinic_agonist#not &NULL}{m1_muscarinic_agonist__names__gerund}? @@ -158,4 +159,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%m1_muscarinic_agonist%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz/transform.py b/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz/transform.py index a0c1d38d0..4bac109cf 100644 --- a/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz/transform.py +++ b/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz/transform.py @@ -228,8 +228,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz/meta.yaml b/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz/meta.yaml index 188ad2067..dd530724b 100644 --- a/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz/meta.yaml +++ b/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz/meta.yaml @@ -127,26 +127,27 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {m1_muscarinic_antagonist#not &NULL}be {m1_muscarinic_antagonist__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {m1_muscarinic_antagonist#not &NULL}{m1_muscarinic_antagonist__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {m1_muscarinic_antagonist__names__gerund}:{m1_muscarinic_antagonist#no&yes} + - Is the {SMILES__description} {SMILES#} {m1_muscarinic_antagonist__names__gerund}:{m1_muscarinic_antagonist#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {m1_muscarinic_antagonist__names__gerund}. 
{#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{m1_muscarinic_antagonist#False&True} + Result: {m1_muscarinic_antagonist#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {m1_muscarinic_antagonist__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {m1_muscarinic_antagonist#not &NULL}{m1_muscarinic_antagonist__names__gerund}. + Result:This molecule is {m1_muscarinic_antagonist#not &NULL}{m1_muscarinic_antagonist__names__gerund}. - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {m1_muscarinic_antagonist__names__gerund}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {m1_muscarinic_antagonist%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {m1_muscarinic_antagonist#not &NULL}{m1_muscarinic_antagonist__names__gerund}? @@ -160,4 +161,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%m1_muscarinic_antagonist%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz/transform.py b/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz/transform.py index 89a5b4ab5..3fae45dbd 100644 --- a/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz/transform.py +++ b/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz/transform.py @@ -220,8 +220,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/mattermodeling_stackexchange/meta.yaml b/data/tabular/mattermodeling_stackexchange/meta.yaml index 294c49de3..7a5273f38 100644 --- a/data/tabular/mattermodeling_stackexchange/meta.yaml +++ b/data/tabular/mattermodeling_stackexchange/meta.yaml @@ -28,3 +28,9 @@ templates: {#Task: Generate a title for this question.|Task: Create a meaningful title for this question.|Task: Summarize the question in a title.!} {#Question: |Inquiry: |\n!}{#q} {#Assistant: |Title: |Answer: !}{#title} + - |- + {#Task: Generate a question based on the answer.|Task: Create a question that corresponds to the answer.|Task: Formulate a question that matches the answer.|Task: Develop a question that aligns with the answer.|Task: Construct a question that is answered by the provided response.|Task: Create a question that is relevant to the answer.!} + {#Answer: |Response: |Solution: |!}{#a} + {#Assistant: |Question: |Inquiry: |!}{#q} + - The answer to the {#question|help request|query!} "{#q}" is "{#a}". + - The title of the {#question|help request|query!} "{#q}" is "{#title}". 
\ No newline at end of file diff --git a/data/tabular/melting_points/meta.yaml b/data/tabular/melting_points/meta.yaml index 2db430833..aedf06a08 100644 --- a/data/tabular/melting_points/meta.yaml +++ b/data/tabular/melting_points/meta.yaml @@ -93,17 +93,17 @@ templates: Compound: {NAME#} - Result:{mp#} {mp__units} + Result: {mp#} {mp__units} - |- Task: Please estimate the {mp__names__noun} of a compound. {SMILES__description}: {SMILES#} - Result:{mp#} {mp__units} + Result: {mp#} {mp__units} - |- Question: What is the {mp__names__noun} of a compound with the {SMILES__description} {SMILES#} in {mp__units}? - Answer:{mp#} + Answer:{mp#} - |- Question: Which molecule has a {mp__names__noun} of {mp#} {mp__units}? Pick {%multiple_choice_enum%3%aA1}. diff --git a/data/tabular/mofdscribe/meta.yaml b/data/tabular/mofdscribe/meta.yaml index e437af8ed..dcf9f0502 100644 --- a/data/tabular/mofdscribe/meta.yaml +++ b/data/tabular/mofdscribe/meta.yaml @@ -58,3 +58,25 @@ templates: Task: {#Create|Generate|Propose!} a {cif__names__noun} of a {#metal-organic framework|MOF|crystal structure|structure|material!} with the following description {description#}. {#Answer: |A: |!}{cif#} + - |- + The {cif__names__noun} {cif#} describes a {#metal-organic framework|MOF|crystal structure|structure|material!} with the following description + {description#}. + - |- + Task: Translate the description below into its corresponding {cif__names__noun} {cif#} for a metal-organic framework. + Description: {description#} + - |- + Based on the following details of a metal-organic framework, generate the {cif__names__noun} {cif#}. + Details: {description#} + - |- + Please convert this MOF structure description into its {cif__names__noun} {cif#} representation. + {description#} + - |- + Task: Using the description provided, formulate the matching {cif__names__noun} {cif#} that encapsulates the metal-organic framework's structure. 
+ Input: {description#} + - |- + Instruction: Derive a {cif__names__noun} {cif#} file for the metal-organic framework described as follows. + Response: {description#} + - The {cif__names__noun} of {#metal-organic framework|MOF|crystal structure|structure|material!} matching the description {description#} is {cif#}. + - |- + Convert the given metal-organic framework description into a comprehensive {cif__names__noun} {cif#}. + Response: {description#} diff --git a/data/tabular/mona/meta.yaml b/data/tabular/mona/meta.yaml index 3f991800a..f74ec7787 100644 --- a/data/tabular/mona/meta.yaml +++ b/data/tabular/mona/meta.yaml @@ -1,84 +1,96 @@ name: mona -description: MassBank of North America, public repository of mass spectra for small molecules +description: MassBank of North America, public repository of mass spectra for small + molecules targets: - - id: spectral_entropy - type: continuous - units: nat - names: - - noun: spectral entropy - description: The entropy of the spectrum. - - id: normalized_entropy - type: continuous - units: - names: - - noun: normalized entropy - description: The normalized entropy of the spectrum (ratio of spectral entropy to maximum possible entropy for a spectrum with the same number of peaks). +- id: spectral_entropy + type: continuous + units: nat + names: + - noun: spectral entropy + description: The entropy of the spectrum. +- id: normalized_entropy + type: continuous + units: null + names: + - noun: normalized entropy + description: The normalized entropy of the spectrum (ratio of spectral entropy to + maximum possible entropy for a spectrum with the same number of peaks). 
identifiers: - - id: SMILES - type: SMILES - description: SMILES - - id: inchi - type: InChI - description: InChI - - id: id - type: Other - description: MassBank ID - sample: "False" +- id: SMILES + type: SMILES + description: SMILES +- id: inchi + type: InChI + description: InChI +- id: inchikey + type: InChIKey + description: InChIKey +- id: id + type: Other + description: MassBank ID + sample: 'False' license: CC BY 4.0 links: - - name: MassBank of North America - url: https://mona.fiehnlab.ucdavis.edu/ - description: original repository - - name: HuggingFace dataset upload - url: https://huggingface.co/datasets/adamoyoung/mona - description: HuggingFace dataset upload +- name: MassBank of North America + url: https://mona.fiehnlab.ucdavis.edu/ + description: original repository +- name: HuggingFace dataset upload + url: https://huggingface.co/datasets/adamoyoung/mona + description: HuggingFace dataset upload benchmarks: [] num_points: 194721 bibtex: [] templates: - - The molecule with the {SMILES__description} {#representation of |!}{SMILES#} has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. - - Based on the {SMILES__description} {#representation of |!}{SMILES#}, the molecule has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. - - The {SMILES__description} {SMILES#} {#represents|is representing!} a molecule {#that has a|with a!} {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. - - The molecule with the {SMILES__description} {SMILES#} has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. - - |- - Task: Please predict a molecule feature based on the description. - Description: Predict the {spectral_entropy__names__noun} in {spectral_entropy__units}. 
- {#Molecule |!}{SMILES__description}: {SMILES#} - Constraint: Even if you are {#uncertain|not sure!}, you must answer with a numeric value in {spectral_entropy__units} without using any {#other|additional!} words. - Result: {spectral_entropy#} {spectral_entropy__units} - - |- - Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. - Description: A molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. - Result: {SMILES#} - - |- - User: Can you {#tell me|derive|estimate!} the {spectral_entropy__names__noun} in {spectral_entropy__units} of the molecule with the {SMILES__description} {SMILES#}? - Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, this molecule has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. - - |- - User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}? - Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#} - - |- - User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. - Assistant: This is a molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}: {SMILES#} - - |- - User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. - Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? - User: Yes, please. The molecule should have a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. 
- Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} represents a molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}: {SMILES#} - - |- - User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. - Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? - User: Yes, the molecule should have a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. - Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} represents a molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}: {SMILES#} - - The {spectral_entropy__names__noun} of the molecule with the {SMILES__description} {SMILES#} is:{spectral_entropy#} {spectral_entropy__units} - - The {spectral_entropy__names__noun} of the {SMILES__description} {SMILES#} is:{spectral_entropy#} {spectral_entropy__units} - - The {spectral_entropy__names__noun} of the molecule {SMILES__description} {SMILES#} is:{spectral_entropy#} {spectral_entropy__units} - - |- - Task: Please predict a molecule feature based on the description. - Description: Predict the {spectral_entropy__names__noun} in {spectral_entropy__units} of a molecule. - {#Molecule |!}{SMILES__description}: {SMILES#} - Constraint: Even if you are {#uncertain|not sure!}, you must answer with a numeric value in {spectral_entropy__units} without using any {#other|additional!} words. - Result:{spectral_entropy#} {spectral_entropy__units} - - |- - Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. - Description: A molecule that has {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. 
- Result:{SMILES#} +- The molecule with the {SMILES__description} {#representation of |!}{SMILES#} has + a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. +- Based on the {SMILES__description} {#representation of |!}{SMILES#}, the molecule + has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. +- The {SMILES__description} {SMILES#} {#represents|is representing!} a molecule {#that + has a|with a!} {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. +- The molecule with the {SMILES__description} {SMILES#} has a {spectral_entropy__names__noun} + of {spectral_entropy#} {spectral_entropy__units}. +- |- + Task: Please predict a molecule feature based on the description. + Description: Predict the {spectral_entropy__names__noun} in {spectral_entropy__units}. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Even if you are {#uncertain|not sure!}, you must answer with a numeric value in {spectral_entropy__units} without using any {#other|additional!} words. + Result: {spectral_entropy#} {spectral_entropy__units} +- |- + Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. + Description: A molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. + Result: {SMILES#} +- |- + User: Can you {#tell me|derive|estimate!} the {spectral_entropy__names__noun} in {spectral_entropy__units} of the molecule with the {SMILES__description} {SMILES#}? + Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, this molecule has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. +- |- + User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}? 
+ Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#} +- |- + User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. + Assistant: This is a molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}: {SMILES#} +- |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? + User: Yes, please. The molecule should have a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. + Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} represents a molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}: {SMILES#} +- |- + User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? + User: Yes, the molecule should have a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. + Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} represents a molecule that has a {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}: {SMILES#} +- The {spectral_entropy__names__noun} of the molecule with the {SMILES__description} + {SMILES#} is:{spectral_entropy#} {spectral_entropy__units} +- The {spectral_entropy__names__noun} of the {SMILES__description} {SMILES#} is:{spectral_entropy#} + {spectral_entropy__units} +- The {spectral_entropy__names__noun} of the molecule {SMILES__description} {SMILES#} + is:{spectral_entropy#} {spectral_entropy__units} +- |- + Task: Please predict a molecule feature based on the description. 
+ Description: Predict the {spectral_entropy__names__noun} in {spectral_entropy__units} of a molecule. + {#Molecule |!}{SMILES__description}: {SMILES#} + Constraint: Even if you are {#uncertain|not sure!}, you must answer with a numeric value in {spectral_entropy__units} without using any {#other|additional!} words. + Result:{spectral_entropy#} {spectral_entropy__units} +- |- + Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. + Description: A molecule that has {spectral_entropy__names__noun} of {spectral_entropy#} {spectral_entropy__units}. + Result:{SMILES#} diff --git a/data/tabular/moses/transform.py b/data/tabular/moses/transform.py index 5d5de23a2..c382a2f1f 100644 --- a/data/tabular/moses/transform.py +++ b/data/tabular/moses/transform.py @@ -100,8 +100,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/mp_descriptions/meta.yaml b/data/tabular/mp_descriptions/meta.yaml index e61c31814..c854e573a 100644 --- a/data/tabular/mp_descriptions/meta.yaml +++ b/data/tabular/mp_descriptions/meta.yaml @@ -95,3 +95,5 @@ templates: Assistant: {#Sure, |I can give it a try, |!} I {#would need|need|require!} the description of the {#crystal structure|material|compound|material structure|structure!} to do that. User: {description#} Assistant: {cifstr#} + - The {#crystal structure|material|compound|material structure|structure!} described by {description#} is represented by the {cifstr__names__noun} {cifstr#}. + - The {cifstr__names__noun} {cifstr#} corresponds to the {#crystal structure|material|compound|material structure|structure!} described by {description#}. 
\ No newline at end of file diff --git a/data/tabular/ncbi_disease/meta.yaml b/data/tabular/ncbi_disease/meta.yaml index fb26f760d..d895d89de 100644 --- a/data/tabular/ncbi_disease/meta.yaml +++ b/data/tabular/ncbi_disease/meta.yaml @@ -52,3 +52,12 @@ templates: User: Does the following text contain mentions of diseases?{# Can you return matches?| Can you output matches?!} {#Text: |!}{sentence#} Assistant: {#I found|There is!} {matched_words#} + - |- + Question: Are there any mentions of diseases in the following text? + {#Text: |!}{sentence#} + Answer: {matched_words#} + - |- + The words {matched_words#} in the sentence {sentence#} are mentions of diseases. + - |- + Task: Identify all the disease mentions in the following text: {sentence#}. + Solution: {matched_words#} \ No newline at end of file diff --git a/data/tabular/nlmchem/meta.yaml b/data/tabular/nlmchem/meta.yaml index 88cbbd8ad..4fe444031 100644 --- a/data/tabular/nlmchem/meta.yaml +++ b/data/tabular/nlmchem/meta.yaml @@ -37,8 +37,11 @@ bibtex: doi = {10.1038/s41597-021-00875-1}, url = {https://doi.org/10.1038/s41597-021-00875-1} } + templates: - The {Abbreviation__names__noun} "{Abbreviation#}" stands for "{#Full_Form}". + - The abbreviation "{Abbreviation#}" {#matches the full form|stands for|is associated with!} "{Full_Form#}". + - In other {#words|terms!}, "{Abbreviation#}" is the abbreviation for "{Full_Form#}". - |- Task: Please give me the {Full_Form__names__noun} of the {Abbreviation__names__noun}. Abbreviation: {Abbreviation#} @@ -62,9 +65,12 @@ templates: Task: Please give me the {Full_Form__names__noun} of the {Abbreviation__names__noun}. Abbreviation: {Abbreviation#} Constraint: Answer the question with {#full|complete!} words. - Result:{Full_Form#} + Result: {Full_Form#} - |- Task: Please give me the {Abbreviation__names__noun} of the following {Full_Form__names__noun}. Full form or meaning of the abbreviation: {Full_Form#} Constraint: Answer the question with an {Abbreviation__names__noun}. 
- Result:{Abbreviation#} + Result: {Abbreviation#} + - |- + User: I'm {#searching for|looking for|trying to find!} the {Abbreviation__names__noun} for: {#Full_Form} + Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {Abbreviation#} diff --git a/data/tabular/nlmchem/transform.py b/data/tabular/nlmchem/transform.py index 3a7446f66..a9d87c05c 100644 --- a/data/tabular/nlmchem/transform.py +++ b/data/tabular/nlmchem/transform.py @@ -30,106 +30,7 @@ def get_and_transform_data(): # Save to CSV fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - - # Create meta.yaml - meta = { - "name": "NLM-Chem", - "description": ( - "NLM-Chem is a new resource for chemical entity recognition in PubMed full text literature." - ), - "identifiers": [ - { - "id": "Abbreviation", - "description": "abbreviation of a term", - "type": "Other", - "names": [{"noun": "abbreviation"}], - }, - { - "id": "MeSH_Identifier", - "description": "unique codes for Medical Subject Headings", - "type": "categorical", - "names": [{"noun": "MeSH identifier"}], - "sample": False, - }, - ], - "targets": [ - { - "id": "Full_Form", - "description": "full form or meaning of the abbreviation", - "type": "categorical", - "names": [{"noun": "full form or meaning"}], - }, - ], - "license": "CC BY 4.0", - "links": [ - { - "url": "https://ftp.ncbi.nlm.nih.gov/pub/lu/NLMChem/", - "description": "data source", - }, - { - "url": "https://www.nature.com/articles/s41597-021-00875-1", - "description": "publication", - }, - ], - "num_points": len(df), - "bibtex": [ - """@article{Islamaj2021, -author = {Islamaj, R. and Leaman, R. and Kim, S. 
and Lu, Z.}, -title = {NLM-Chem, a new resource for chemical entity recognition in PubMed full text literature}, -journal = {Nature Scientific Data}, -volume = {8}, -number = {91}, -year = {2021}, -doi = {10.1038/s41597-021-00875-1}, -url = {https://doi.org/10.1038/s41597-021-00875-1} -}""", - ], - "templates": [ - 'The {Abbreviation__names__noun} "{Abbreviation#}" stands for "{#Full_Form}".', # noqa - """Task: Please give me the {Full_Form__names__noun} of the {Abbreviation__names__noun}. -Abbreviation: {Abbreviation#} -Constraint: Answer the question with {#full|complete!} words. -Result: {Full_Form#}""", # noqa - """Task: Please give me the {Abbreviation__names__noun} of the following {Full_Form__names__noun}. -Full form or meaning of the abbreviation: {Full_Form#} -Constraint: Answer the question with an {Abbreviation__names__noun}. -Result: {Abbreviation#}""", # noqa - """User: Can you give me the {Abbreviation__names__noun} of the following {Full_Form__names__noun}: {#Full_Form} -Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {Abbreviation#}""", # noqa - """User: Can you give me the {Full_Form__names__noun} of the following {Abbreviation__names__noun}: {#Abbreviation} -Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {Full_Form#}""", # noqa - """User: I'm {#searching|looking!} for the {Abbreviation__names__noun} for: {#Full_Form} -Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {Abbreviation#}""", # noqa - """Task: Please give me the {Full_Form__names__noun} of the {Abbreviation__names__noun}. -Abbreviation: {Abbreviation#} -Constraint: Answer the question with {#full|complete!} words. -Result:{Full_Form#}""", # noqa - """Task: Please give me the {Abbreviation__names__noun} of the following {Full_Form__names__noun}. -Full form or meaning of the abbreviation: {Full_Form#} -Constraint: Answer the question with an {Abbreviation__names__noun}. 
-Result:{Abbreviation#}""", # noqa - ], - } - - def str_presenter(dumper, data): - """Configure yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # Check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer(str, str_presenter) - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - # Add the file_path code here - file_path = os.path.abspath(fn_meta) - print(f"Meta.yaml is being saved at: {file_path}") - print(f"Finished processing {meta['name']} dataset!") - + print("Processed data saved to", fn_data_csv) if __name__ == "__main__": get_and_transform_data() diff --git a/data/tabular/nr_ahr_tox21/meta.yaml b/data/tabular/nr_ahr_tox21/meta.yaml index bc964782c..982fafaec 100644 --- a/data/tabular/nr_ahr_tox21/meta.yaml +++ b/data/tabular/nr_ahr_tox21/meta.yaml @@ -92,17 +92,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {toxicity_NR-AhR#not &NULL}be {toxicity_NR-AhR__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_NR-AhR#not &NULL}{toxicity_NR-AhR__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {toxicity_NR-AhR__names__adjective}:{toxicity_NR-AhR#no&yes} + - Is the {SMILES__description} {SMILES#} {toxicity_NR-AhR__names__adjective}:{toxicity_NR-AhR#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_NR-AhR__names__adjective}. 
{#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{toxicity_NR-AhR#False&True} + Result: {toxicity_NR-AhR#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {toxicity_NR-AhR__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_NR-AhR__names__adjective}? @@ -116,7 +116,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_NR-AhR%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {toxicity_NR-AhR#not &NULL}{toxicity_NR-AhR__names__adjective}? @@ -130,4 +131,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%toxicity_NR-AhR%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/nr_ahr_tox21/transform.py b/data/tabular/nr_ahr_tox21/transform.py index 522c694a5..4d116d9ae 100644 --- a/data/tabular/nr_ahr_tox21/transform.py +++ b/data/tabular/nr_ahr_tox21/transform.py @@ -203,8 +203,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/nr_ar_lbd_tox21/meta.yaml b/data/tabular/nr_ar_lbd_tox21/meta.yaml index ac6226ed8..012ffca57 100644 --- a/data/tabular/nr_ar_lbd_tox21/meta.yaml +++ b/data/tabular/nr_ar_lbd_tox21/meta.yaml @@ -92,17 +92,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {toxicity_NR-AR-LBD#not &NULL}be {toxicity_NR-AR-LBD__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_NR-AR-LBD#not &NULL}{toxicity_NR-AR-LBD__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {toxicity_NR-AR-LBD__names__adjective}:{toxicity_NR-AR-LBD#no&yes} + - Is the {SMILES__description} {SMILES#} {toxicity_NR-AR-LBD__names__adjective}:{toxicity_NR-AR-LBD#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_NR-AR-LBD__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. 
- Result:{toxicity_NR-AR-LBD#False&True} + Result: {toxicity_NR-AR-LBD#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {toxicity_NR-AR-LBD__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_NR-AR-LBD__names__adjective}? @@ -116,7 +116,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_NR-AR-LBD%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {toxicity_NR-AR-LBD#not &NULL}{toxicity_NR-AR-LBD__names__adjective}? @@ -130,4 +131,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%toxicity_NR-AR-LBD%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/nr_ar_lbd_tox21/transform.py b/data/tabular/nr_ar_lbd_tox21/transform.py index f2da520d4..35cc969a8 100644 --- a/data/tabular/nr_ar_lbd_tox21/transform.py +++ b/data/tabular/nr_ar_lbd_tox21/transform.py @@ -206,8 +206,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/nr_ar_tox21/meta.yaml b/data/tabular/nr_ar_tox21/meta.yaml index 7e09db035..7a68b8d76 100644 --- a/data/tabular/nr_ar_tox21/meta.yaml +++ b/data/tabular/nr_ar_tox21/meta.yaml @@ -92,17 +92,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {toxicity_NR-AR#not &NULL}be {toxicity_NR-AR__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_NR-AR#not &NULL}{toxicity_NR-AR__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {toxicity_NR-AR__names__adjective}:{toxicity_NR-AR#no&yes} + - Is the {SMILES__description} {SMILES#} {toxicity_NR-AR__names__adjective}:{toxicity_NR-AR#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_NR-AR__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{toxicity_NR-AR#False&True} + Result: {toxicity_NR-AR#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. 
Description: A molecule that is {toxicity_NR-AR__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_NR-AR__names__adjective}? @@ -116,7 +116,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_NR-AR%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {toxicity_NR-AR#not &NULL}{toxicity_NR-AR__names__adjective}? @@ -130,4 +131,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%toxicity_NR-AR%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/nr_ar_tox21/transform.py b/data/tabular/nr_ar_tox21/transform.py index 4951a69d7..12aeb5991 100644 --- a/data/tabular/nr_ar_tox21/transform.py +++ b/data/tabular/nr_ar_tox21/transform.py @@ -198,8 +198,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/nr_aromatase_tox21/meta.yaml b/data/tabular/nr_aromatase_tox21/meta.yaml index 9f8a2527a..236fd6083 100644 --- a/data/tabular/nr_aromatase_tox21/meta.yaml +++ b/data/tabular/nr_aromatase_tox21/meta.yaml @@ -92,17 +92,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? 
User: Yes, the molecule should {toxicity_NR-Aromatase#not &NULL}be {toxicity_NR-Aromatase__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_NR-Aromatase#not &NULL}{toxicity_NR-Aromatase__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {toxicity_NR-Aromatase__names__adjective}:{toxicity_NR-Aromatase#no&yes} + - Is the {SMILES__description} {SMILES#} {toxicity_NR-Aromatase__names__adjective}:{toxicity_NR-Aromatase#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_NR-Aromatase__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{toxicity_NR-Aromatase#False&True} + Result: {toxicity_NR-Aromatase#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {toxicity_NR-Aromatase__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_NR-Aromatase__names__adjective}? @@ -116,7 +116,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_NR-Aromatase%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {toxicity_NR-Aromatase#not &NULL}{toxicity_NR-Aromatase__names__adjective}? @@ -130,4 +131,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%toxicity_NR-Aromatase%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/nr_aromatase_tox21/transform.py b/data/tabular/nr_aromatase_tox21/transform.py index 08b8b0719..f2cb7460b 100644 --- a/data/tabular/nr_aromatase_tox21/transform.py +++ b/data/tabular/nr_aromatase_tox21/transform.py @@ -201,8 +201,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/nr_er_lbd_tox21/meta.yaml b/data/tabular/nr_er_lbd_tox21/meta.yaml index 1fdcb8b5e..1867258d0 100644 --- a/data/tabular/nr_er_lbd_tox21/meta.yaml +++ b/data/tabular/nr_er_lbd_tox21/meta.yaml @@ -97,17 +97,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {toxicity_NR-ER-LBD#not &NULL}be {toxicity_NR-ER-LBD__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_NR-ER-LBD#not &NULL}{toxicity_NR-ER-LBD__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {toxicity_NR-ER-LBD__names__adjective}:{toxicity_NR-ER-LBD#no&yes} + - Is the {SMILES__description} {SMILES#} {toxicity_NR-ER-LBD__names__adjective}:{toxicity_NR-ER-LBD#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_NR-ER-LBD__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. 
- Result:{toxicity_NR-ER-LBD#False&True} + Result: {toxicity_NR-ER-LBD#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {toxicity_NR-ER-LBD__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_NR-ER-LBD__names__adjective}? @@ -121,7 +121,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_NR-ER-LBD%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {toxicity_NR-ER-LBD#not &NULL}{toxicity_NR-ER-LBD__names__adjective}? @@ -135,4 +136,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%toxicity_NR-ER-LBD%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/nr_er_lbd_tox21/transform.py b/data/tabular/nr_er_lbd_tox21/transform.py index 94fb3a959..4c7e76807 100644 --- a/data/tabular/nr_er_lbd_tox21/transform.py +++ b/data/tabular/nr_er_lbd_tox21/transform.py @@ -220,8 +220,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/nr_er_tox21/meta.yaml b/data/tabular/nr_er_tox21/meta.yaml index 1cd12711d..bd72fa914 100644 --- a/data/tabular/nr_er_tox21/meta.yaml +++ b/data/tabular/nr_er_tox21/meta.yaml @@ -97,17 +97,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {toxicity_NR-ER#not &NULL}be {toxicity_NR-ER__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_NR-ER#not &NULL}{toxicity_NR-ER__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {toxicity_NR-ER__names__adjective}:{toxicity_NR-ER#no&yes} + - Is the {SMILES__description} {SMILES#} {toxicity_NR-ER__names__adjective}:{toxicity_NR-ER#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_NR-ER__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{toxicity_NR-ER#False&True} + Result: {toxicity_NR-ER#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. 
Description: A molecule that is {toxicity_NR-ER__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_NR-ER__names__adjective}? @@ -121,7 +121,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_NR-ER%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {toxicity_NR-ER#not &NULL}{toxicity_NR-ER__names__adjective}? @@ -135,4 +136,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%toxicity_NR-ER%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/nr_er_tox21/transform.py b/data/tabular/nr_er_tox21/transform.py index cb4a8bc55..c166a7194 100644 --- a/data/tabular/nr_er_tox21/transform.py +++ b/data/tabular/nr_er_tox21/transform.py @@ -210,8 +210,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/nr_ppar_gamma_tox21/meta.yaml b/data/tabular/nr_ppar_gamma_tox21/meta.yaml index ba3f47178..5513bfb4f 100644 --- a/data/tabular/nr_ppar_gamma_tox21/meta.yaml +++ b/data/tabular/nr_ppar_gamma_tox21/meta.yaml @@ -93,17 +93,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? 
User: Yes, the molecule should {toxicity_NR-PPAR-gamma#not &NULL}be {toxicity_NR-PPAR-gamma__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_NR-PPAR-gamma#not &NULL}{toxicity_NR-PPAR-gamma__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {toxicity_NR-PPAR-gamma__names__adjective}:{toxicity_NR-PPAR-gamma#no&yes} + - Is the {SMILES__description} {SMILES#} {toxicity_NR-PPAR-gamma__names__adjective}:{toxicity_NR-PPAR-gamma#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_NR-PPAR-gamma__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{toxicity_NR-PPAR-gamma#False&True} + Result: {toxicity_NR-PPAR-gamma#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {toxicity_NR-PPAR-gamma__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_NR-PPAR-gamma__names__adjective}? @@ -117,7 +117,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_NR-PPAR-gamma%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {toxicity_NR-PPAR-gamma#not &NULL}{toxicity_NR-PPAR-gamma__names__adjective}? @@ -131,4 +132,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%toxicity_NR-PPAR-gamma%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/nr_ppar_gamma_tox21/transform.py b/data/tabular/nr_ppar_gamma_tox21/transform.py index f10de13f5..830210c65 100644 --- a/data/tabular/nr_ppar_gamma_tox21/transform.py +++ b/data/tabular/nr_ppar_gamma_tox21/transform.py @@ -208,8 +208,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/ocp/meta.yaml b/data/tabular/ocp/meta.yaml index bc414d6aa..7c412ec1a 100644 --- a/data/tabular/ocp/meta.yaml +++ b/data/tabular/ocp/meta.yaml @@ -43,3 +43,19 @@ templates: Task: {#Predict|Estimate|Calculate|Compute|Determine!} the adsorption energy of the following adsorbate-adsorbent pair. Text: {text#} Answer: {target#} {target__units} + - The adsorption energy of the following adsorbate-adsorbent pair is {target#} {target__units}. + - |- + {#Task|Problem statement!}: Answer the {#multiple choice|multiple-choice|MCQ!} question. + {#Question|Query!}: What is the {target__names__noun} of a {#compound|drug!} with the {text__description} {text#}? + Constraint: Based on the text description, you must return none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Text: {text#} + Options: + {target%} + Answer: {%multiple_choice_result}. {target#} {target__units} + - |- + {#Task|Problem statement!}: Answer the {#multiple choice|multiple-choice|MCQ!} question. + {#Question|Query!}: Which description corresponds to an adsorption energy of {target#} {target__units}? + Constraint: You must return none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
+ Options: + {text%} + Answer: {%multiple_choice_result} diff --git a/data/tabular/opv/meta.yaml b/data/tabular/opv/meta.yaml index e2dd8c79d..0b9d57c4c 100644 --- a/data/tabular/opv/meta.yaml +++ b/data/tabular/opv/meta.yaml @@ -158,19 +158,19 @@ templates: - |- Question: What is the {Voc__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#}? - Answer:{Voc#} {Voc__units} + Answer: {Voc#} {Voc__units} - |- Question: What is the {Jsc__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#}? - Answer:{Jsc#} {Jsc__units} + Answer: {Jsc#} {Jsc__units} - |- Question: What is the {FF__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#}? - Answer:{FF#} + Answer: {FF#} - |- Question: What is the {bandgap__names__noun} of a polymer with monomer {SMILES__description} {SMILES#}? - Answer:{bandgap#} {bandgap__units} + Answer: {bandgap#} {bandgap__units} - |- Task: Please answer the multiple choice question. @@ -182,7 +182,8 @@ templates: {PCE_ave%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. @@ -194,7 +195,8 @@ templates: {Voc%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. @@ -206,7 +208,8 @@ templates: {Jsc%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. 
@@ -218,7 +221,8 @@ templates: {FF%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. @@ -228,4 +232,5 @@ templates: {bandgap%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/ord_masked/meta.yaml b/data/tabular/ord_masked/meta.yaml index 8c1486436..047255dc9 100644 --- a/data/tabular/ord_masked/meta.yaml +++ b/data/tabular/ord_masked/meta.yaml @@ -48,3 +48,8 @@ templates: Task: Predict the masked component in a {masked_rxn_smiles__names__noun}. Description: {masked_rxn_smiles#} {#Answer|Solution!}: {missing_component#} + - Analyze the reaction given by {masked_rxn_smiles#}; the chemical entity concealed by "MASK" is identified as {missing_component#}. + + + + diff --git a/data/tabular/ord_procedure_steps/meta.yaml b/data/tabular/ord_procedure_steps/meta.yaml index 4c2102f4e..4d024b89a 100644 --- a/data/tabular/ord_procedure_steps/meta.yaml +++ b/data/tabular/ord_procedure_steps/meta.yaml @@ -51,3 +51,7 @@ templates: Task: Convert a {procedure__names__noun} into a {steps_string__names__noun}. Procedure: {procedure#} Answer: {steps_string#} + - |- + The {procedure__names__noun} {procedure#} {#is|involves|requires!} the following {steps_string__names__noun}: {steps_string#} + - |- + The {steps_string__names__noun} {steps_string#} {#are|is!} part of the {procedure__names__noun} {procedure#} diff --git a/data/tabular/ord_rxn_smiles_yield_pred/meta.yaml b/data/tabular/ord_rxn_smiles_yield_pred/meta.yaml index 92ce5a078..3107e1d0b 100644 --- a/data/tabular/ord_rxn_smiles_yield_pred/meta.yaml +++ b/data/tabular/ord_rxn_smiles_yield_pred/meta.yaml @@ -50,3 +50,8 @@ templates: - |- Question: {#What is|What's|What is the|What's the!} {yield__names__noun} of a reaction with the {RXNSMILES__names__noun} {RXNSMILES#}? Answer: {yield#}{yield__units}. 
+ - |- + Task: {#Predict|Estimate|Calculate|Determine!} the {yield__names__noun} of a reaction with the {RXNSMILES__names__noun} {RXNSMILES#}. + Solution: {yield#}{yield__units}. + - |- + The {#reaction|chemical reaction!} with the {RXNSMILES__names__noun} {RXNSMILES#} has a {yield__names__noun} of {yield#}{yield__units}. \ No newline at end of file diff --git a/data/tabular/ord_steps_yield/meta.yaml b/data/tabular/ord_steps_yield/meta.yaml index 40087f18d..bd19b43ac 100644 --- a/data/tabular/ord_steps_yield/meta.yaml +++ b/data/tabular/ord_steps_yield/meta.yaml @@ -51,3 +51,20 @@ templates: Task: {#Predict|Estimate!} the {yield__names__noun} of a reaction based on the {non_yield_steps_string__names__noun}. Description: {non_yield_steps_string#} Answer: {yield#}{yield__units} + - |- + Given a chemical reaction with {non_yield_steps_string__names__noun} "{non_yield_steps_string#}", the {yield__names__noun} is {yield#}{yield__units}. + - |- + The {#expected|predicted|resulting!} {yield__names__noun} for a reaction following these steps: {non_yield_steps_string#} is {yield#}{yield__units}. + - |- + User: What {yield__names__noun} can I expect when performing this reaction? + Reaction details: {non_yield_steps_string#} + Assistant: Based on these reaction conditions, you should achieve a {yield__names__noun} of {yield#}{yield__units}. + - |- + User: Calculate the {yield__names__noun} for this synthetic procedure: + {non_yield_steps_string#} + Assistant: The calculated {yield__names__noun} for this procedure is {yield#}{yield__units}. + + - |- + {#Analyzing|Evaluating|Assessing!} the following {non_yield_steps_string__names__noun}: + "{non_yield_steps_string#}" + The {yield__names__noun} prediction is {yield#}{yield__units}. 
diff --git a/data/tabular/orexin1_receptor_butkiewicz/meta.yaml b/data/tabular/orexin1_receptor_butkiewicz/meta.yaml index 8284c38ef..a3590b3fd 100644 --- a/data/tabular/orexin1_receptor_butkiewicz/meta.yaml +++ b/data/tabular/orexin1_receptor_butkiewicz/meta.yaml @@ -134,17 +134,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {activity_orexin1#not &NULL}be {activity_orexin1__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_orexin1#not &NULL}{activity_orexin1__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {activity_orexin1__names__adjective}:{activity_orexin1#no&yes} + - Is the {SMILES__description} {SMILES#} {activity_orexin1__names__adjective}:{activity_orexin1#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_orexin1__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{activity_orexin1#False&True} + Result: {activity_orexin1#False&True} - |- Task: Please {#give me|create|generate!} the {SMILES__description} of a {#molecule|chemical|chemical structure!} based on the {#text |!}description{# below|!}. Description: A molecule that is {activity_orexin1__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_orexin1__names__adjective}? @@ -158,7 +158,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. 
Options: {activity_orexin1%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {activity_orexin1#not &NULL}{activity_orexin1__names__adjective}? @@ -172,4 +173,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional|extra!} words. Options: {SMILES%activity_orexin1%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/orexin1_receptor_butkiewicz/transform.py b/data/tabular/orexin1_receptor_butkiewicz/transform.py index ac290a1a7..a0bc4b393 100644 --- a/data/tabular/orexin1_receptor_butkiewicz/transform.py +++ b/data/tabular/orexin1_receptor_butkiewicz/transform.py @@ -220,8 +220,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al/meta.yaml b/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al/meta.yaml index eee4e831f..b322a293a 100644 --- a/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al/meta.yaml +++ b/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al/meta.yaml @@ -99,17 +99,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {Pgp_inhibition#not &NULL}be {Pgp_inhibition__names__adjective}. 
Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {Pgp_inhibition#not &NULL}{Pgp_inhibition__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {Pgp_inhibition__names__adjective}:{Pgp_inhibition#no&yes} + - Is the {SMILES__description} {SMILES#} {Pgp_inhibition__names__adjective}:{Pgp_inhibition#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {Pgp_inhibition__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{Pgp_inhibition#False&True} + Result: {Pgp_inhibition#False&True} - |- Task: Please {#give me|create|generate!} a {SMILES__description} of a {#molecule|chemical|chemical compound!} based on the {#text |!}description{# below|!}. Description: A molecule that is {Pgp_inhibition__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {Pgp_inhibition__names__adjective}? @@ -123,7 +123,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {Pgp_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {Pgp_inhibition#not &NULL}{Pgp_inhibition__names__adjective}? @@ -137,4 +138,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%Pgp_inhibition%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al/transform.py b/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al/transform.py index e9244958c..f82320767 100644 --- a/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al/transform.py +++ b/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al/transform.py @@ -208,8 +208,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/pampa_ncats/example_processing_and_templates.ipynb b/data/tabular/pampa_ncats/example_processing_and_templates.ipynb index 625a862cb..a45f26e85 100644 --- a/data/tabular/pampa_ncats/example_processing_and_templates.ipynb +++ b/data/tabular/pampa_ncats/example_processing_and_templates.ipynb @@ -754,7 +754,7 @@ "outputs": [], "source": [ "with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)" + " #yaml.dump(meta, f, sort_keys=False)" ] }, { @@ -964,7 +964,7 @@ " ) # to use with safe_dum\n", " fn_meta = \"meta.yaml\"\n", " with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)\n", + " #yaml.dump(meta, f, sort_keys=False)\n", "\n", " print(f\"Finished processing {meta['name']} dataset!\")\n", "\n", diff --git a/data/tabular/pampa_ncats/meta.yaml b/data/tabular/pampa_ncats/meta.yaml index 8ba6dc79b..c6cb667f0 100644 --- a/data/tabular/pampa_ncats/meta.yaml +++ b/data/tabular/pampa_ncats/meta.yaml @@ -93,17 +93,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {permeability#not &NULL}be {permeability__names__adjective}. 
Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {permeability#not &NULL}{permeability__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {permeability__names__adjective}:{permeability#no&yes} + - Is the {SMILES__description} {SMILES#} {permeability__names__adjective}:{permeability#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {permeability__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{permeability#False&True} + Result: {permeability#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {permeability__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {permeability__names__adjective}? @@ -117,7 +117,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {permeability%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {permeability#not &NULL}{permeability__names__adjective}? @@ -131,4 +132,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%permeability%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/pampa_ncats/transform.py b/data/tabular/pampa_ncats/transform.py index 3d640103b..21b6078c3 100644 --- a/data/tabular/pampa_ncats/transform.py +++ b/data/tabular/pampa_ncats/transform.py @@ -186,8 +186,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/peptides_hemolytic/meta.yaml b/data/tabular/peptides_hemolytic/meta.yaml index cee39ecc5..efb91b1e0 100644 --- a/data/tabular/peptides_hemolytic/meta.yaml +++ b/data/tabular/peptides_hemolytic/meta.yaml @@ -96,17 +96,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#amino acid sequence|one!}? User: Yes, the amino acid sequence should {hemolytic#not &NULL}be {hemolytic__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {sequence__description} is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#} - - Is the {sequence__description} {sequence#} {hemolytic__names__adjective}:{hemolytic#no&yes} + - Is the {sequence__description} {sequence#} {hemolytic__names__adjective}:{hemolytic#no&yes} - |- Task: Please classify a {#amino acid sequence|sequence of amino acids|peptide!} based on the description. Description: A amino acid sequence that is {hemolytic__names__adjective}. {#amino acid sequence|sequence of amino acids!}: {sequence#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. 
- Result:{hemolytic#False&True} + Result: {hemolytic#False&True} - |- Task: Please {#give me|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!} based on the {#text |!}description{# below|!}. Description: A {#amino acid sequence|sequence of amino acids|peptide!} that is {hemolytic__names__adjective}. - Result:{sequence#} + Result: {sequence#} - |- Task: Please answer the multiple choice question. Question: Is the peptide with the {sequence__description} {#representation of |!}{sequence#} {hemolytic__names__adjective}? @@ -120,7 +120,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {hemolytic%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which amino acid sequences are {hemolytic#not &NULL}{hemolytic__names__adjective}? @@ -134,4 +135,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {sequence%hemolytic%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/peptides_hemolytic/transform.py b/data/tabular/peptides_hemolytic/transform.py index 000c7cfe3..30b8a8f83 100644 --- a/data/tabular/peptides_hemolytic/transform.py +++ b/data/tabular/peptides_hemolytic/transform.py @@ -235,8 +235,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/peptides_nonfouling/meta.yaml b/data/tabular/peptides_nonfouling/meta.yaml index a04f3a862..c1b2a3fda 100644 --- a/data/tabular/peptides_nonfouling/meta.yaml +++ b/data/tabular/peptides_nonfouling/meta.yaml @@ -95,17 +95,17 @@ templates: Assistant: {#This sounds very exciting. |Nice. | Very interesting. |I would love to help you. |This sounds very interesting. !}Should it be a special {#amino acid sequence|one!}? User: Yes, the amino acid sequence should {nonfouling#not &NULL}be {nonfouling__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this is {nonfouling#not &NULL}{nonfouling__names__adjective}: {sequence#} - - Is the {sequence#} {nonfouling__names__adjective}:{nonfouling#no&yes} + - Is the {sequence#} {nonfouling__names__adjective}:{nonfouling#no&yes} - |- Task: Please classify a amino acid sequence based on the description. Description: A amino acid sequence that is {nonfouling__names__adjective}. {#amino acid sequence |!}: {sequence#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{nonfouling#False&True} + Result: {nonfouling#False&True} - |- Task: Please {#give me|create|generate!} a {#amino acid sequence |!} based on the {#text |!}description{# below|!}. 
Description: A amino acid sequence that is {nonfouling__names__adjective}. - Result:{sequence#} + Result: {sequence#} - |- Task: Please answer the multiple choice question. Question: Is the amino acid sequence with the {#representation of |!}{sequence#} {nonfouling__names__adjective}? @@ -119,7 +119,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {nonfouling%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which amino acid sequences are {nonfouling#not &NULL}{nonfouling__names__adjective}? @@ -133,4 +134,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {sequence%nonfouling%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/peptides_nonfouling/transform.py b/data/tabular/peptides_nonfouling/transform.py index 4d1879cd0..6ecd05ec7 100644 --- a/data/tabular/peptides_nonfouling/transform.py +++ b/data/tabular/peptides_nonfouling/transform.py @@ -217,8 +217,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/peptides_soluble/meta.yaml b/data/tabular/peptides_soluble/meta.yaml index f656e0995..dbff352a2 100644 --- a/data/tabular/peptides_soluble/meta.yaml +++ b/data/tabular/peptides_soluble/meta.yaml @@ -96,17 +96,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#amino acid sequence|one!}? 
User: Yes, the amino acid sequence should {soluble#not &NULL}be {soluble__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {sequence__description} is {soluble#not &NULL}{soluble__names__adjective}: {sequence#} - - Is the {sequence__description} {sequence#} {soluble__names__adjective}:{soluble#no&yes} + - Is the {sequence__description} {sequence#} {soluble__names__adjective}:{soluble#no&yes} - |- Task: Please classify a amino acid sequence based on the description. Description: A amino acid sequence that is {soluble__names__adjective}. {#amino acid sequence |sequence|AA sequence!}: {sequence#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{soluble#False&True} + Result: {soluble#False&True} - |- Task: Please {#give me|create|generate!} a {#amino acid sequence|sequence|AA sequence!} based on the {#text |!}description{# below|!}. Description: A amino acid sequence that is {soluble__names__adjective}. - Result:{sequence#} + Result: {sequence#} - |- Task: Please answer the multiple choice question. Question: Is the peptide with the {sequence__description} {#representation of |!}{sequence#} {soluble__names__adjective}? @@ -120,7 +120,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {soluble%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which amino acid sequences are {soluble#not &NULL}{soluble__names__adjective}? @@ -134,4 +135,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {sequence%soluble%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/peptides_soluble/transform.py b/data/tabular/peptides_soluble/transform.py index d14d0a1b6..8c5cd0e3a 100644 --- a/data/tabular/peptides_soluble/transform.py +++ b/data/tabular/peptides_soluble/transform.py @@ -217,8 +217,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/physics_stackexchange/meta.yaml b/data/tabular/physics_stackexchange/meta.yaml index 2aca00745..8baee67c4 100644 --- a/data/tabular/physics_stackexchange/meta.yaml +++ b/data/tabular/physics_stackexchange/meta.yaml @@ -28,3 +28,10 @@ templates: {#Task: Generate a title for this question.|Task: Create a meaningful title for this question.|Task: Summarize the question in a title.!} {#Question: |Inquiry: |\n!}{#q} {#Assistant: |Title: |Answer: |!}{#title} + - |- + {#Task: Generate a question based on the answer.|Task: Create a question that corresponds to the answer.|Task: Formulate a question that matches the answer.|Task: Develop a question that aligns with the answer.|Task: Construct a question that is answered by the provided response.|Task: Create a question that is relevant to the answer.!} + {#Answer: |Response: |Solution: |!}{#a} + {#Assistant: |Question: |Inquiry: |!}{#q} + - The answer to the {#question|help request|query!} "{#q}" is "{#a}". + - The title of the {#question|help request|query!} "{#q}" is "{#title}". 
+ diff --git a/data/tabular/potassium_ion_channel_kir2_1_butkiewicz/meta.yaml b/data/tabular/potassium_ion_channel_kir2_1_butkiewicz/meta.yaml index d77f89cbc..3035fc36f 100644 --- a/data/tabular/potassium_ion_channel_kir2_1_butkiewicz/meta.yaml +++ b/data/tabular/potassium_ion_channel_kir2_1_butkiewicz/meta.yaml @@ -135,17 +135,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {activity_potassium_ion_channel#not &NULL}be {activity_potassium_ion_channel__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_potassium_ion_channel#not &NULL}{activity_potassium_ion_channel__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {activity_potassium_ion_channel__names__adjective}:{activity_potassium_ion_channel#no&yes} + - Is the {SMILES__description} {SMILES#} {activity_potassium_ion_channel__names__adjective}:{activity_potassium_ion_channel#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_potassium_ion_channel__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{activity_potassium_ion_channel#False&True} + Result: {activity_potassium_ion_channel#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {activity_potassium_ion_channel__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_potassium_ion_channel__names__adjective}? 
@@ -159,7 +159,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {activity_potassium_ion_channel%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {activity_potassium_ion_channel#not &NULL}{activity_potassium_ion_channel__names__adjective}? @@ -173,4 +174,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%activity_potassium_ion_channel%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/potassium_ion_channel_kir2_1_butkiewicz/transform.py b/data/tabular/potassium_ion_channel_kir2_1_butkiewicz/transform.py index 03f5ab250..484a3cc0a 100644 --- a/data/tabular/potassium_ion_channel_kir2_1_butkiewicz/transform.py +++ b/data/tabular/potassium_ion_channel_kir2_1_butkiewicz/transform.py @@ -232,8 +232,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/qm8/transform.py b/data/tabular/qm8/transform.py index a90d76b5c..50b5ee8f8 100644 --- a/data/tabular/qm8/transform.py +++ b/data/tabular/qm8/transform.py @@ -1,4 +1,8 @@ import pandas as pd +from rdkit import Chem + +def canonicize_smiles(smiles): + return Chem.MolToSmiles(Chem.MolFromSmiles(smiles)) def process(): @@ -28,6 +32,7 @@ def process(): # the values for all those columns are floats. 
df = df[df[columns].apply(lambda x: x.apply(lambda y: isinstance(y, float))).all(1)] df[columns] = df[columns].astype(float) + df["SMILES"] = df["SMILES"].apply(canonicize_smiles) print(len(df)) df.to_csv("data_clean.csv", index=False) diff --git a/data/tabular/rhea_db_masked/meta.yaml b/data/tabular/rhea_db_masked/meta.yaml index 032502b4b..09673cf39 100644 --- a/data/tabular/rhea_db_masked/meta.yaml +++ b/data/tabular/rhea_db_masked/meta.yaml @@ -67,3 +67,4 @@ templates: Task: Predict the masked component in a {masked_rxn_smiles__names__noun}. Description: {masked_rxn_smiles#} {#Answer|Solution!}: {missing_component#} + - In the {masked_rxn_smiles__names__noun} {masked_rxn_smiles#}, the undisclosed {#chemical|compound!} has been identified as {missing_component#}. diff --git a/data/tabular/sarscov2_3clpro_diamond/meta.yaml b/data/tabular/sarscov2_3clpro_diamond/meta.yaml index 97b7e0f9a..07c42c02a 100644 --- a/data/tabular/sarscov2_3clpro_diamond/meta.yaml +++ b/data/tabular/sarscov2_3clpro_diamond/meta.yaml @@ -113,30 +113,31 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {activity_SARSCoV2_3CLPro#not &NULL}be {activity_SARSCoV2_3CLPro__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_SARSCoV2_3CLPro#not &NULL}{activity_SARSCoV2_3CLPro__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {activity_SARSCoV2_3CLPro__names__gerund}:{activity_SARSCoV2_3CLPro#no&yes} + - Is the {SMILES__description} {SMILES#} {activity_SARSCoV2_3CLPro__names__gerund}:{activity_SARSCoV2_3CLPro#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_SARSCoV2_3CLPro__names__gerund}. 
{#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{activity_SARSCoV2_3CLPro#False&True} + Result: {activity_SARSCoV2_3CLPro#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_SARSCoV2_3CLPro__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {activity_SARSCoV2_3CLPro#not &NULL}{activity_SARSCoV2_3CLPro__names__gerund}. + Result:This molecule is {activity_SARSCoV2_3CLPro#not &NULL}{activity_SARSCoV2_3CLPro__names__gerund}. - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {activity_SARSCoV2_3CLPro__names__gerund}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_SARSCoV2_3CLPro__names__gerund}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {activity_SARSCoV2_3CLPro%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {activity_SARSCoV2_3CLPro#not &NULL}{activity_SARSCoV2_3CLPro__names__gerund}? @@ -150,4 +151,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%activity_SARSCoV2_3CLPro%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/sarscov2_3clpro_diamond/transform.py b/data/tabular/sarscov2_3clpro_diamond/transform.py index e088a60d6..a390f48a5 100644 --- a/data/tabular/sarscov2_3clpro_diamond/transform.py +++ b/data/tabular/sarscov2_3clpro_diamond/transform.py @@ -216,8 +216,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/sarscov2_vitro_touret/meta.yaml b/data/tabular/sarscov2_vitro_touret/meta.yaml index c0000923b..072aeb111 100644 --- a/data/tabular/sarscov2_vitro_touret/meta.yaml +++ b/data/tabular/sarscov2_vitro_touret/meta.yaml @@ -88,30 +88,31 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {activity_SARSCoV2#not &NULL}be {activity_SARSCoV2__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_SARSCoV2#not &NULL}{activity_SARSCoV2__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {activity_SARSCoV2__names__gerund}:{activity_SARSCoV2#no&yes} + - Is the {SMILES__description} {SMILES#} {activity_SARSCoV2__names__gerund}:{activity_SARSCoV2#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_SARSCoV2__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{activity_SARSCoV2#False&True} + Result: {activity_SARSCoV2#False&True} - |- Task: Please classify a molecule based on the description. 
Description: A molecule that is {activity_SARSCoV2__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {activity_SARSCoV2#not &NULL}{activity_SARSCoV2__names__gerund}. + Result:This molecule is {activity_SARSCoV2#not &NULL}{activity_SARSCoV2__names__gerund}. - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {activity_SARSCoV2__names__gerund}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_SARSCoV2__names__gerund}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {activity_SARSCoV2%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {activity_SARSCoV2#not &NULL}{activity_SARSCoV2__names__gerund}? @@ -125,4 +126,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%activity_SARSCoV2%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/sarscov2_vitro_touret/transform.py b/data/tabular/sarscov2_vitro_touret/transform.py index 903076d69..997efd0c7 100644 --- a/data/tabular/sarscov2_vitro_touret/transform.py +++ b/data/tabular/sarscov2_vitro_touret/transform.py @@ -186,8 +186,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/serine_threonine_kinase_33_butkiewicz/meta.yaml b/data/tabular/serine_threonine_kinase_33_butkiewicz/meta.yaml index 889c01852..4ba94f0a6 100644 --- a/data/tabular/serine_threonine_kinase_33_butkiewicz/meta.yaml +++ b/data/tabular/serine_threonine_kinase_33_butkiewicz/meta.yaml @@ -128,30 +128,31 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {activity_serine_threonine_kinase33#not &NULL}be {activity_serine_threonine_kinase33__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_serine_threonine_kinase33#not &NULL}{activity_serine_threonine_kinase33__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {activity_serine_threonine_kinase33__names__gerund}:{activity_serine_threonine_kinase33#no&yes} + - Is the {SMILES__description} {SMILES#} {activity_serine_threonine_kinase33__names__gerund}:{activity_serine_threonine_kinase33#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_serine_threonine_kinase33__names__gerund}. 
{#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{activity_serine_threonine_kinase33#False&True} + Result: {activity_serine_threonine_kinase33#False&True} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_serine_threonine_kinase33__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Answer the question in a {#full|complete!} sentence. - Result:This molecule is {activity_serine_threonine_kinase33#not &NULL}{activity_serine_threonine_kinase33__names__gerund}. + Result:This molecule is {activity_serine_threonine_kinase33#not &NULL}{activity_serine_threonine_kinase33__names__gerund}. - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {activity_serine_threonine_kinase33__names__gerund}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_serine_threonine_kinase33__names__gerund}? Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {activity_serine_threonine_kinase33%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {activity_serine_threonine_kinase33#not &NULL}{activity_serine_threonine_kinase33__names__gerund}? @@ -165,4 +166,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%activity_serine_threonine_kinase33%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/serine_threonine_kinase_33_butkiewicz/transform.py b/data/tabular/serine_threonine_kinase_33_butkiewicz/transform.py index e8b7cb205..ee2f1f47d 100644 --- a/data/tabular/serine_threonine_kinase_33_butkiewicz/transform.py +++ b/data/tabular/serine_threonine_kinase_33_butkiewicz/transform.py @@ -231,8 +231,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/skin_reaction/meta.yaml b/data/tabular/skin_reaction/meta.yaml index cda530fe5..b2782f6f3 100644 --- a/data/tabular/skin_reaction/meta.yaml +++ b/data/tabular/skin_reaction/meta.yaml @@ -102,17 +102,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {skin_reaction#not &NULL}be {skin_reaction__names__gerund}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {skin_reaction#not &NULL}{skin_reaction__names__gerund}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {skin_reaction__names__gerund}:{skin_reaction#no&yes} + - Is the {SMILES__description} {SMILES#} {skin_reaction__names__gerund}:{skin_reaction#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {skin_reaction__names__gerund}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. 
- Result:{skin_reaction#False&True} + Result: {skin_reaction#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {skin_reaction__names__gerund}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {skin_reaction__names__gerund}? @@ -126,7 +126,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {skin_reaction%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {skin_reaction#not &NULL}{skin_reaction__names__gerund}? @@ -140,4 +141,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%skin_reaction%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/skin_reaction/transform.py b/data/tabular/skin_reaction/transform.py index d4bc8da50..a52347246 100644 --- a/data/tabular/skin_reaction/transform.py +++ b/data/tabular/skin_reaction/transform.py @@ -222,8 +222,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/solubility_aqsoldb/meta.yaml b/data/tabular/solubility_aqsoldb/meta.yaml index 327fcf110..cd0b183ef 100644 --- a/data/tabular/solubility_aqsoldb/meta.yaml +++ b/data/tabular/solubility_aqsoldb/meta.yaml @@ -64,3 +64,10 @@ templates: Assistant: I {#recommend|suggest|propose|advise|!} the {#compound|drug|chemical|molecule!} with the {SMILES__description} {SMILES#}. {#Is there anything else I can do for you?|Do you need anything else?|Anything else?|!} User: {#Yes, |!}I would like to know the {compound_name__names__noun} of the {#compound|drug|chemical|molecule!}. Assistant: The {compound_name__names__noun} of the {#compound|drug|chemical|molecule!} with the {SMILES__description} {SMILES#} is {compound_name#}. + - For the given {#compound|drug|chemical|molecule!} represented by its {SMILES__description} {SMILES#}, the calculated {aqeuous_solubility__names__noun} is {aqeuous_solubility#} {aqeuous_solubility__units}. + - In our analysis, the {aqeuous_solubility__names__noun} of the {#compound|drug|chemical|molecule!} with structure {SMILES__description} {SMILES#} is reported as {aqeuous_solubility#} {aqeuous_solubility__units}. 
+ - Measured at room temperature, the {aqeuous_solubility__names__noun} for the {#compound|drug|chemical|molecule!} identified by its {compound_name__names__noun} {compound_name#} and depicted by {SMILES__description} {SMILES#} is {aqeuous_solubility#} {aqeuous_solubility__units}. + - Our dataset indicates that the {aqeuous_solubility__names__noun} of a {#compound|drug|chemical|molecule!} with {SMILES__description} {SMILES#} is determined to be {aqeuous_solubility#} {aqeuous_solubility__units}. + - For the {#compound|drug|chemical|molecule!} specified by its {compound_name__names__noun} {compound_name#} and represented with {SMILES__description} {SMILES#}, the {aqeuous_solubility__names__noun} is quantified at {aqeuous_solubility#} {aqeuous_solubility__units}. + - The computed {aqeuous_solubility__names__noun} of the {#compound|drug|chemical|molecule!} having structure {SMILES__description} {SMILES#} registers as {aqeuous_solubility#} {aqeuous_solubility__units}. + - When evaluating the {#compound|drug|chemical|molecule!} with {SMILES__description} {SMILES#}, we observe a {aqeuous_solubility__names__noun} value of {aqeuous_solubility#} {aqeuous_solubility__units}. diff --git a/data/tabular/solubility_aqsoldb/transform.py b/data/tabular/solubility_aqsoldb/transform.py index 816ef5ff6..1829ca0b4 100644 --- a/data/tabular/solubility_aqsoldb/transform.py +++ b/data/tabular/solubility_aqsoldb/transform.py @@ -137,7 +137,7 @@ def get_and_transform_data(): # ) # to use with safe_dum # fn_meta = "meta.yaml" # with open(fn_meta, "w") as f: - # yaml.dump(meta, f, sort_keys=False) + # #yaml.dump(meta, f, sort_keys=False) # print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/sr_are_tox21/meta.yaml b/data/tabular/sr_are_tox21/meta.yaml index 2a7edf09c..9506f772c 100644 --- a/data/tabular/sr_are_tox21/meta.yaml +++ b/data/tabular/sr_are_tox21/meta.yaml @@ -99,17 +99,17 @@ templates: Assistant: {#This sounds very exciting. 
|This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {toxicity_SR-ARE#not &NULL}be {toxicity_SR-ARE__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_SR-ARE#not &NULL}{toxicity_SR-ARE__names__adjective}: {SMILES#} -- Is the {SMILES__description} {SMILES#} {toxicity_SR-ARE__names__adjective}:{toxicity_SR-ARE#no&yes} +- Is the {SMILES__description} {SMILES#} {toxicity_SR-ARE__names__adjective}:{toxicity_SR-ARE#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_SR-ARE__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{toxicity_SR-ARE#False&True} + Result: {toxicity_SR-ARE#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {toxicity_SR-ARE__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_SR-ARE__names__adjective}? @@ -123,7 +123,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_SR-ARE%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + # - |- # Task: Please answer the multiple choice question. # Question: Which molecules are {toxicity_SR-ARE#not &NULL}{toxicity_SR-ARE__names__adjective}? @@ -137,4 +138,5 @@ templates: # Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
# Options: # {SMILES%toxicity_SR-ARE%} -# Answer:{%multiple_choice_result} +# Answer: {%multiple_choice_result} + diff --git a/data/tabular/sr_are_tox21/transform.py b/data/tabular/sr_are_tox21/transform.py index e622e87aa..69bd02d25 100644 --- a/data/tabular/sr_are_tox21/transform.py +++ b/data/tabular/sr_are_tox21/transform.py @@ -205,8 +205,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/sr_atad5_tox21/meta.yaml b/data/tabular/sr_atad5_tox21/meta.yaml index 0115535e8..b51c9efce 100644 --- a/data/tabular/sr_atad5_tox21/meta.yaml +++ b/data/tabular/sr_atad5_tox21/meta.yaml @@ -94,17 +94,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {toxicity_SR-ATAD5#not &NULL}be {toxicity_SR-ATAD5__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_SR-ATAD5#not &NULL}{toxicity_SR-ATAD5__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {toxicity_SR-ATAD5__names__adjective}:{toxicity_SR-ATAD5#no&yes} + - Is the {SMILES__description} {SMILES#} {toxicity_SR-ATAD5__names__adjective}:{toxicity_SR-ATAD5#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_SR-ATAD5__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{toxicity_SR-ATAD5#False&True} + Result: {toxicity_SR-ATAD5#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. 
Description: A molecule that is {toxicity_SR-ATAD5__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_SR-ATAD5__names__adjective}? @@ -118,7 +118,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_SR-ATAD5%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {toxicity_SR-ATAD5#not &NULL}{toxicity_SR-ATAD5__names__adjective}? @@ -132,4 +133,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%toxicity_SR-ATAD5%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/sr_atad5_tox21/transform.py b/data/tabular/sr_atad5_tox21/transform.py index 6d0f59cab..e66a9442e 100644 --- a/data/tabular/sr_atad5_tox21/transform.py +++ b/data/tabular/sr_atad5_tox21/transform.py @@ -213,8 +213,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/sr_hse_tox21/meta.yaml b/data/tabular/sr_hse_tox21/meta.yaml index bd69b7c5a..6621d8cde 100644 --- a/data/tabular/sr_hse_tox21/meta.yaml +++ b/data/tabular/sr_hse_tox21/meta.yaml @@ -94,17 +94,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {toxicity_SR-HSE#not &NULL}be {toxicity_SR-HSE__names__adjective}. 
Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_SR-HSE#not &NULL}{toxicity_SR-HSE__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {toxicity_SR-HSE__names__adjective}:{toxicity_SR-HSE#no&yes} + - Is the {SMILES__description} {SMILES#} {toxicity_SR-HSE__names__adjective}:{toxicity_SR-HSE#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_SR-HSE__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{toxicity_SR-HSE#False&True} + Result: {toxicity_SR-HSE#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {toxicity_SR-HSE__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_SR-HSE__names__adjective}? @@ -118,7 +118,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_SR-HSE%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {toxicity_SR-HSE#not &NULL}{toxicity_SR-HSE__names__adjective}? @@ -132,4 +133,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
Options: {SMILES%toxicity_SR-HSE%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/sr_hse_tox21/transform.py b/data/tabular/sr_hse_tox21/transform.py index c4ee9788a..e75d58f22 100644 --- a/data/tabular/sr_hse_tox21/transform.py +++ b/data/tabular/sr_hse_tox21/transform.py @@ -201,8 +201,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/sr_mmp_tox21/meta.yaml b/data/tabular/sr_mmp_tox21/meta.yaml index 3dd42d263..84d1f1c6c 100644 --- a/data/tabular/sr_mmp_tox21/meta.yaml +++ b/data/tabular/sr_mmp_tox21/meta.yaml @@ -94,17 +94,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {toxicity_SR-MMP#not &NULL}be {toxicity_SR-MMP__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_SR-MMP#not &NULL}{toxicity_SR-MMP__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {toxicity_SR-MMP__names__adjective}:{toxicity_SR-MMP#no&yes} + - Is the {SMILES__description} {SMILES#} {toxicity_SR-MMP__names__adjective}:{toxicity_SR-MMP#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {toxicity_SR-MMP__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{toxicity_SR-MMP#False&True} + Result: {toxicity_SR-MMP#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. 
Description: A molecule that is {toxicity_SR-MMP__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_SR-MMP__names__adjective}? @@ -118,4 +118,5 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_SR-MMP%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/sr_mmp_tox21/transform.py b/data/tabular/sr_mmp_tox21/transform.py index 9649356a1..4378c20f3 100644 --- a/data/tabular/sr_mmp_tox21/transform.py +++ b/data/tabular/sr_mmp_tox21/transform.py @@ -209,8 +209,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/sr_p53_tox21/meta.yaml b/data/tabular/sr_p53_tox21/meta.yaml index e8c0eb5be..f9d219c22 100644 --- a/data/tabular/sr_p53_tox21/meta.yaml +++ b/data/tabular/sr_p53_tox21/meta.yaml @@ -98,17 +98,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {toxicity_SR-p53#not &NULL}be {toxicity_SR-p53__names__adjective}. Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {toxicity_SR-p53#not &NULL}{toxicity_SR-p53__names__adjective}: {SMILES#} -- Is the {SMILES__description} {SMILES#} {toxicity_SR-p53__names__adjective}:{toxicity_SR-p53#no&yes} +- Is the {SMILES__description} {SMILES#} {toxicity_SR-p53__names__adjective}:{toxicity_SR-p53#no&yes} - |- Task: Please classify a molecule based on the description. 
Description: A molecule that is {toxicity_SR-p53__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{toxicity_SR-p53#False&True} + Result: {toxicity_SR-p53#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {toxicity_SR-p53__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {toxicity_SR-p53__names__adjective}? @@ -122,4 +122,5 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {toxicity_SR-p53%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/sr_p53_tox21/transform.py b/data/tabular/sr_p53_tox21/transform.py index 60239e40a..ed8cbf3d4 100644 --- a/data/tabular/sr_p53_tox21/transform.py +++ b/data/tabular/sr_p53_tox21/transform.py @@ -202,8 +202,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/thermosol/meta.yaml b/data/tabular/thermosol/meta.yaml index 76b6a4dab..ecb4beb73 100644 --- a/data/tabular/thermosol/meta.yaml +++ b/data/tabular/thermosol/meta.yaml @@ -63,3 +63,13 @@ templates: Assistant: {#Cool, |Awesome, |Great, |That sounds interesting, |!}I would need to know the {target__names__noun} of the {#compound|drug|chemical|molecule!} you want to design. 
User: The {target__names__noun} should be {target#} {target__units}. Assistant: I {#recommend|suggest|propose|advise|!} the {#compound|drug|chemical|molecule!} with the {SMILES__description} {SMILES#}. + - |- + {#Question:|Q:!} help me identify which of the options below have a {target__names__noun} of {target#} {target__units}. + Constraint: You must select none, one or more options from {%multiple_choice_enum%3-5%aA1} without using any {#other|additional!} words. + Options: {SMILES%target%} + Answer: {%multiple_choice_result} + - |- + {#Question:|Q:!} help me identify the correct solubility of the {#compound|molecule!} with the {SMILES__description} {SMILES#}. + Constraint: You must select none, one or more options from {%multiple_choice_enum%3-5%aA1} without using any {#other|additional!} words. + Options: {target%SMILES%} + Answer: {%multiple_choice_result} \ No newline at end of file diff --git a/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz/meta.yaml b/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz/meta.yaml index 47bcb2fdc..c84b36dcf 100644 --- a/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz/meta.yaml +++ b/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz/meta.yaml @@ -127,17 +127,17 @@ templates: Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? User: Yes, the molecule should {activity_tyrosyl_dna_phosphodiesterase#not &NULL}be {activity_tyrosyl_dna_phosphodiesterase__names__adjective}. 
Assistant: {#Understood|Got it|Ok!}, this {SMILES__description} is {activity_tyrosyl_dna_phosphodiesterase#not &NULL}{activity_tyrosyl_dna_phosphodiesterase__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {activity_tyrosyl_dna_phosphodiesterase__names__adjective}:{activity_tyrosyl_dna_phosphodiesterase#no&yes} + - Is the {SMILES__description} {SMILES#} {activity_tyrosyl_dna_phosphodiesterase__names__adjective}:{activity_tyrosyl_dna_phosphodiesterase#no&yes} - |- Task: Please classify a molecule based on the description. Description: A molecule that is {activity_tyrosyl_dna_phosphodiesterase__names__adjective}. {#Molecule |!}{SMILES__description}: {SMILES#} Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{activity_tyrosyl_dna_phosphodiesterase#False&True} + Result: {activity_tyrosyl_dna_phosphodiesterase#False&True} - |- Task: Please {#give me|create|generate!} a {#molecule |!}{SMILES__description} based on the {#text |!}description{# below|!}. Description: A molecule that is {activity_tyrosyl_dna_phosphodiesterase__names__adjective}. - Result:{SMILES#} + Result: {SMILES#} - |- Task: Please answer the multiple choice question. Question: Is the molecule with the {SMILES__description} {#representation of |!}{SMILES#} {activity_tyrosyl_dna_phosphodiesterase__names__adjective}? @@ -151,7 +151,8 @@ templates: Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. Options: {activity_tyrosyl_dna_phosphodiesterase%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + - |- Task: Please answer the multiple choice question. Question: Which molecules are {activity_tyrosyl_dna_phosphodiesterase#not &NULL}{activity_tyrosyl_dna_phosphodiesterase__names__adjective}? 
@@ -165,4 +166,5 @@ templates: Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. Options: {SMILES%activity_tyrosyl_dna_phosphodiesterase%} - Answer:{%multiple_choice_result} + Answer: {%multiple_choice_result} + diff --git a/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz/transform.py b/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz/transform.py index a0523e79b..5c5a15a14 100644 --- a/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz/transform.py +++ b/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz/transform.py @@ -236,8 +236,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/uniprot_binding_single/meta.yaml b/data/tabular/uniprot_binding_single/meta.yaml index b47d37c88..55037a547 100644 --- a/data/tabular/uniprot_binding_single/meta.yaml +++ b/data/tabular/uniprot_binding_single/meta.yaml @@ -36,8 +36,8 @@ bibtex: doi = {10.1093/nar/gkac1052}, url = {https://doi.org/10.1093/nar/gkac1052}} templates: - # - |- - # The {#molecule|chemical|compound!} with the {SMILES__description}{# representation|!} {SMILES#} binds to the {#AA sequence|amino acid sequence|peptide sequence|protein!} {sequence#} at the {#site|binding site|position!} {start_binding_site#}{#-| to !}{end_binding_site#}. + - |- + The {#molecule|chemical|compound!} with the {SMILES__description}{# representation|!} {SMILES#} binds to the {#AA sequence|amino acid sequence|peptide sequence|protein!} {sequence#} at the {#site|binding site|position!} {start_binding_site#}{#-| to !}{end_binding_site#}. 
- |- Task: {#Find|Identify|Come up with!} a binding site for the {#molecule|chemical|compound!} in the {#AA sequence|amino acid sequence|peptide sequence|protein!}. {#AA sequence|Amino acid sequence|Peptide sequence|Protein!}: {sequence#} @@ -59,9 +59,9 @@ templates: Task: {#Find|Identify|Come up with!} a binding site in the {#AA sequence|amino acid sequence|peptide sequence|protein!} for the {#molecule|chemical|compound!}. {#AA sequence|Amino acid sequence|Peptide sequence|Protein!}: {sequence#} {SMILES__description}{# representation|!}: {SMILES#} - {#Output|Result!}:{start_binding_site#} + {#Output|Result!}:{start_binding_site#} - |- Task: {#Create|Design|Come up with!} a {#molecule|chemical|compound!} that binds to the given {#binding site|site|position|!} in the {#AA sequence|amino acid sequence|peptide sequence|protein!}. {#AA sequence|Amino acid sequence|Peptide sequence|Protein!}: {sequence#} Binding site{# position|!}: {start_binding_site#} - {#Output|Result!}:{SMILES#} + {#Output|Result!}:{SMILES#} diff --git a/data/tabular/uniprot_binding_sites_multiple/meta.yaml b/data/tabular/uniprot_binding_sites_multiple/meta.yaml index 25e96f59d..06811a9bf 100644 --- a/data/tabular/uniprot_binding_sites_multiple/meta.yaml +++ b/data/tabular/uniprot_binding_sites_multiple/meta.yaml @@ -64,9 +64,9 @@ templates: Task: {#Find|Identify|Come up with!} a binding site in the {#AA sequence|amino acid sequence|peptide sequence|protein!} for the {#molecule|chemical|compound!}. {#AA sequence|Amino acid sequence|Peptide sequence|Protein!}: {sequence#} {SMILES__description}{# representation|!}: {SMILES#} - {#Output|Result!}:{start_binding_site#}-{end_binding_site#} + {#Output|Result!}:{start_binding_site#}-{end_binding_site#} - |- Task: {#Create|Design|Come up with!} a {#molecule|chemical|compound!} that binds to the given {#binding site|site|position|!} in the {#AA sequence|amino acid sequence|peptide sequence|protein!}. 
{#AA sequence|Amino acid sequence|Peptide sequence|Protein!}: {sequence#} Binding site{# position|!}: {start_binding_site#}{#-| to !}{end_binding_site#} - {#Output|Result!}:{SMILES#} + {#Output|Result!}:{SMILES#} diff --git a/data/tabular/uniprot_organisms/meta.yaml b/data/tabular/uniprot_organisms/meta.yaml index 8cf6ca747..a5a9f4ca6 100644 --- a/data/tabular/uniprot_organisms/meta.yaml +++ b/data/tabular/uniprot_organisms/meta.yaml @@ -43,4 +43,18 @@ templates: - |- Task: {#Predict|Identify!} the organism in which {#the below|this!} {#protein|amino acid sequence|AA sequence|polypeptide!} can be found. {#Amino acid sequence|Sequence|AA sequence!}: {other#} - Result:{organisms#} + Result: {organisms#} + - |- + Task: Please answer the multiple choice question. + Question: In which of these organisms can the protein with the {#amino acid sequence|AA sequence!} {other#} be found in? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {organisms%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Which {#amino acid sequence|AA sequence!} can be found in the organism {organisms#}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional|extra!} words. + Options: + {other%} + Answer: {%multiple_choice_result} \ No newline at end of file diff --git a/data/tabular/uniprot_reactions/meta.yaml b/data/tabular/uniprot_reactions/meta.yaml index e7c4ac94b..dbdfc25be 100644 --- a/data/tabular/uniprot_reactions/meta.yaml +++ b/data/tabular/uniprot_reactions/meta.yaml @@ -48,8 +48,8 @@ templates: - |- Task: {#Predict|Identify!} a {#biochemical |chemical |!}reaction that can be catalyzed by {#this|the following!} {#protein|amino acid sequence|AA sequence|polypeptide!}. 
{#Amino acid sequence |Sequence|AA sequence!}: {other#} - Result:{reactions#} + Result: {reactions#} - |- Task: {#Generate|Create|Come up with|Design!} a {#protein|amino acid sequence|AA sequence|polypeptide!} that can catalyze {#a|this!} specific {#biochemical |chemical |!}reaction. Reaction: {reactions#} - {#Output|Result!}:{other#} + {#Output|Result!}:{other#} diff --git a/data/tabular/uniprot_sentences/meta.yaml b/data/tabular/uniprot_sentences/meta.yaml index 3fefb81fa..575c49bae 100644 --- a/data/tabular/uniprot_sentences/meta.yaml +++ b/data/tabular/uniprot_sentences/meta.yaml @@ -48,8 +48,8 @@ templates: - |- Task: {#Generate|Create|Come up with!} a description {#of a few sentences |!}for the {#protein|amino acid sequence|AA sequence|polypeptide!}{# below|!}. {#Protein|Amino acid sequence|AA sequence|Polypeptide!}: {sequence#} - {#Output|Result!}:{sentences#} + {#Output|Result!}:{sentences#} - |- Task: {#Generate|Create|Come up with!} a {#protein|amino acid sequence|AA sequence|polypeptide!} based on the description. 
Description: {sentences#} - {#Output|Result!}:{sequence#} + {#Output|Result!}:{sequence#} diff --git a/data/tabular/volume_of_distribution_at_steady_state_lombardo_et_al/transform.py b/data/tabular/volume_of_distribution_at_steady_state_lombardo_et_al/transform.py index 1d45078bb..d08ec76ff 100644 --- a/data/tabular/volume_of_distribution_at_steady_state_lombardo_et_al/transform.py +++ b/data/tabular/volume_of_distribution_at_steady_state_lombardo_et_al/transform.py @@ -130,8 +130,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/data/tabular/zinc/transform.py b/data/tabular/zinc/transform.py index 8ee2d9361..6e82e0f62 100644 --- a/data/tabular/zinc/transform.py +++ b/data/tabular/zinc/transform.py @@ -98,8 +98,8 @@ def str_presenter(dumper, data): str, str_presenter ) # to use with safe_dum fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) + #with open(fn_meta, "w") as f: + #yaml.dump(meta, f, sort_keys=False) print(f"Finished processing {meta['name']} dataset!") diff --git a/src/chemnlp/data/meta.yaml b/src/chemnlp/data/meta.yaml index 6de661876..4e64cac87 100644 --- a/src/chemnlp/data/meta.yaml +++ b/src/chemnlp/data/meta.yaml @@ -28,4 +28,4 @@ templates: - The compound {compound_name__names__noun} with SMILES {SMILES#} can {#penetrate|not penetrate!} the blood-brain barrier. - The compound {compound_name__names__noun} with SMILES {SMILES#} is in the {split#} set. 
- "Question: Which of the following compounds can penetrate the blood-brain barrier?\nOptions: {%multiple_choice_enum%4%aA1}\n{compound_name%}\nAnswer: {%multiple_choice_result}" - - The compound with SMILES {SMILES#} can penetrate the blood-brain barrier:{penetrate_BBB#} + - The compound with SMILES {SMILES#} can penetrate the blood-brain barrier:{penetrate_BBB#} diff --git a/src/chemnlp/data/sampler.py b/src/chemnlp/data/sampler.py index 539733587..663c6629c 100644 --- a/src/chemnlp/data/sampler.py +++ b/src/chemnlp/data/sampler.py @@ -295,6 +295,7 @@ def _get_target_from_row(self, sample: pd.Series, var: str) -> str: var_dict = next( x for x in self.meta["identifiers"] + self.meta["targets"] if x["id"] == var ) + if var_dict["type"] == "continuous": if not isinstance(out, (float, int)): raise ValueError(f"out is not a number (int or float): {out}") @@ -307,7 +308,6 @@ def _get_target_from_row(self, sample: pd.Series, var: str) -> str: out = f"{round(out, significant_digits):.{significant_digits}f}" else: out = str(out) - if "|" in out: choices = [ c for c in out.split("|") if isinstance(c, str) or not math.isnan(c) @@ -498,6 +498,7 @@ def _get_choices_with_indicator( correct_choice_indicator = self._get_target_from_row( sample, multiple_choice_indicator + "#" ) + df_sample = self.df.sample(len(symbols) - 1)[ [multiple_choice_var, multiple_choice_indicator] ] @@ -522,7 +523,23 @@ def _get_choices_with_indicator( ) if indicator == correct_choice_indicator ] + var_dict = next( + x + for x in self.meta["identifiers"] + self.meta["targets"] + if x["id"] == multiple_choice_var + ) + if var_dict["type"] == "continuous": + significant_digits = var_dict.get( + "significant_digits", + self.config.get( + "DEFAULT_SIGNIFICANT_DIGITS", DEFAULT_SIGNIFICANT_DIGITS + ), + ) + multiple_choices = [ + f"{round(float(x), significant_digits):.{significant_digits}f}" + for x in multiple_choices + ] return list(multiple_choices), correct_choice_idx def _format_choices(self, symbols: 
List[str], choices: List[str]) -> str: @@ -701,7 +718,7 @@ def _fill_template( self, template: str, sample_dict: Dict[str, Union[str, List[str]]] ) -> str: for key, value in sample_dict.items(): - if isinstance(value, list): + if isinstance(value, list) and isinstance(value[0], str): value = "\n".join(value) if "#" in key: # This indicates it's an identifier identifier = key.replace("#", "")