diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3bbbdd5..0ee9a06 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -31,7 +31,7 @@ jobs:
         run: npm run test:coverage
       - name: Archive code coverage results
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: code-coverage-report
           path: ui/coverage/
diff --git a/api/env/Dockerfile_local b/api/env/Dockerfile_local
index 6acfd99..8c51d7d 100644
--- a/api/env/Dockerfile_local
+++ b/api/env/Dockerfile_local
@@ -7,7 +7,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 ENV TZ=Europe/Prague
 
 RUN apt-get update \
-    && apt-get install -y libxml2 g++ vim coreutils nano git libxrender1 wget fakeroot tzdata locales nginx parallel \
+    && apt-get install -y libxml2 g++ vim coreutils nano git libxrender1 wget fakeroot tzdata locales nginx parallel libhdf5-dev \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 RUN ln -fs /usr/share/zoneinfo/Europe/Prague /etc/localtime \
diff --git a/api/src/search.py b/api/src/search.py
index 4325e2a..9a3bb85 100644
--- a/api/src/search.py
+++ b/api/src/search.py
@@ -132,7 +132,7 @@ def search(
 
 def setup_in_memory_data_structures(
     n_classes: int = 2,
-    model: str = 'MLP',
+    model: str = 'MLP10',
     model_path: str = '../data/models/',
     bucket_data_path: str = '../data/bucket-data/',
 ) -> None:
diff --git a/docker-compose.yml b/docker-compose.yml
index b2e90f8..6ed4f5a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -29,6 +29,7 @@ services:
       - 8080:8000
     volumes:
       - "./training/models:/home/user/data/models"
+      - "./training/data:/home/user/data"
       - "./api/eph:/eph"
      - "./api/src:/home/user/src"
     healthcheck:
diff --git a/training/.gitignore b/training/.gitignore
index 8e6b0a4..cef1b40 100644
--- a/training/.gitignore
+++ b/training/.gitignore
@@ -8,7 +8,7 @@ secret.yaml
 kubectl
 *.h5
 data/bucket-*
-data/embedding.pkl
+data/embedding.parquet
 data/kmeans.idx
 models/
 .coverage
diff --git a/training/Dockerfile b/training/Dockerfile
index 1088a6b..7022645 100644
--- a/training/Dockerfile
+++ b/training/Dockerfile
@@ -7,7 +7,7 @@ ENV LC_ALL=en_US.UTF-8
 ENV DEBIAN_FRONTEND=noninteractive
 ENV TZ=Europe/Prague
 
-RUN apt-get update && apt-get install -y libxml2 g++ vim coreutils nano git libxrender1 wget fakeroot tzdata locales && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y libxml2 g++ vim coreutils nano git libxrender1 wget fakeroot tzdata locales libhdf5-dev && apt-get clean && rm -rf /var/lib/apt/lists/*
 RUN ln -fs /usr/share/zoneinfo/Europe/Prague /etc/localtime && dpkg-reconfigure --frontend noninteractive tzdata && sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && locale-gen
 
 COPY requirements.txt /var/requirements.txt
diff --git a/training/alphafind_training/afdb2surfer.py b/training/alphafind_training/afdb2surfer.py
new file mode 100644
index 0000000..6df91c3
--- /dev/null
+++ b/training/alphafind_training/afdb2surfer.py
@@ -0,0 +1,183 @@
+import argparse
+import os
+import subprocess
+import time
+from multiprocessing import Pool
+from pathlib import Path
+
+import pandas as pd
+
+
+def _untar(tar_file, input_dir, scratch_dir):
+    """Helper function to parallelize untarring"""
+    tar_file_path = str(os.path.join(input_dir, tar_file))
+    subprocess.run(
+        ["tar", "-xf", tar_file_path, "-C", scratch_dir, "--wildcards", "*.cif.gz"]
+    )
+
+
+def extract(input_dir, scratch_dir, index):
+    """Extracts a chunk of proteins from tars on persistent storage and moves them to the scratch folder"""
+    scratch_dir_loc = str(os.path.join(scratch_dir))
+    if not os.path.exists(scratch_dir_loc):
+        os.mkdir(scratch_dir_loc)
+
+    # first untar and move to the scratch folder
+    with open(index, "r") as index_f:
+        print(f"Taking from index file {index}")
+        proteins = index_f.readlines()
+        for tar_protein_file in proteins:
+            tar_file = tar_protein_file.split(",")[0].strip()
+            tar_file = tar_file.replace("v3", "v4")
+            _untar(tar_file, input_dir, scratch_dir_loc)
+
+    # then unzip the proteins themselves
+    subprocess.run(["gzip", "-dfr", str(scratch_dir_loc)])
+
+    return scratch_dir_loc
+
+
+def plytoobj(filename):
+    obj_filename = filename[:-4] + ".obj"
+    obj_file = open(obj_filename, "w")
+
+    with open(filename) as ply_file:
+        ply_file_content = ply_file.read().split("\n")[:-1]
+
+        for content in ply_file_content:
+            content_info = content.split()
+            if len(content_info) == 6:
+                vertex_info = "v " + " ".join(content_info[0:3])
+                obj_file.write(vertex_info + "\n")
+            elif len(content_info) == 7:
+                vertex1, vertex2, vertex3 = map(int, content_info[1:4])
+                vertex1, vertex2, vertex3 = vertex1 + 1, vertex2 + 1, vertex3 + 1
+                face_info = (
+                    "f " + str(vertex1) + " " + str(vertex2) + " " + str(vertex3)
+                )
+                obj_file.write(face_info + "\n")
+
+    obj_file.close()
+
+
+def process_protein(it):
+    start_time = time.time()
+
+    path, protein_file = it
+    protein_name = protein_file.split("-")[1]
+
+    # convert from cif -> pdb
+    subprocess.run(
+        ["python3", "/scripts/3d-af-surfer/mmcif_to_pdb.py", "--ciffile", protein_file]
+    )
+
+    # convert to ply -> obj -> grid -> zernike
+    subprocess.run(
+        [
+            "/scripts/3d-af-surfer/bin/EDTSurf",
+            "-i",
+            f"{protein_file}.pdb",
+            "-h",
+            "2",
+            "-f",
+            "1",
+            "-o",
+            f"{path}{protein_name}",
+        ]
+    )
+    plytoobj(f"{path}{protein_name}.ply")
+    subprocess.run(
+        ["/scripts/3d-af-surfer/bin/obj2grid", "-g", "64", f"{path}{protein_name}.obj"]
+    )
+    subprocess.run(
+        [
+            "/scripts/3d-af-surfer/bin/map2zernike",
+            f"{path}{protein_name}.obj.grid",
+            "-c",
+            "0.5",
+        ]
+    )
+
+    # convert to vector
+    with open(f"{path}{protein_name}.obj.grid.inv") as f:
+        embedding = [float(x.strip()) for x in f.readlines()[1:]]
+
+    # clean up
+    subprocess.run(
+        [
+            "rm",
+            f"{path}{protein_name}.ply",
+            f"{path}{protein_name}.obj",
+            f"{path}{protein_name}.obj.grid",
+            f"{path}{protein_name}.obj.grid.inv",
+        ]
+    )
+
+    print(f"Processed {protein_name} in {time.time() - start_time} seconds")
+    return protein_name, embedding
+
+
+def run(input_path, output_path, processes=2):
+    """Run Invariant-3d-coordinates embedding on a directory of cif files
+
+    Args:
+        input_path (str): Path to directory containing cif files
+        output_path (str): Path to save embeddings as parquet
+
+    Returns:
+        None, saves embeddings as .parquet
+    """
+    index = []
+    data = []
+    # os.mkdir(f"{input_path}/objdata")
+
+    proteins = os.listdir(input_path)
+    proteins = [file for file in proteins if file.endswith(".cif")]
+    print("Starting processing", len(proteins), "files...")
+
+    with Pool(processes) as p:
+        results = p.map(
+            process_protein,
+            [
+                ("/scripts/3d-af-surfer/tmp/", f"{input_path}/{protein}")
+                for protein in proteins
+            ]
+        )
+
+    for result in results:
+        index.append(result[0])
+        data.append(result[1])
+
+    df = pd.DataFrame(index=index, data=data, dtype="float32")
+    df.to_parquet(output_path, compression="gzip")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", type=str, required=True)
+    parser.add_argument("--scratch_dir", type=str, required=True)
+    parser.add_argument("--output", type=str, required=True)
+    parser.add_argument("--position", type=int, required=False, default=0)
+    parser.add_argument("--processes", type=int, required=False, default=8)
+    parser.add_argument("--cache", type=bool, required=False, default=False)
+    parser.add_argument("--index", type=str, required=False, default=False)
+
+    args = parser.parse_args()
+
+    input_path = Path(args.input)
+    output_path = Path(args.output)
+    scratch_dir = Path(args.scratch_dir)
+    assert input_path.exists()
+
+    if not output_path.exists():
+        os.mkdir(output_path)
+
+    extracted_data = args.scratch_dir
+    if not args.cache:
+        extracted_data = extract(input_path, scratch_dir, args.index)
+
+    run(
+        extracted_data,
+        os.path.join(output_path, f"afdb2surfer-{args.position}.parquet"),
+        args.processes,
+    )
\ No newline at end of file
diff --git a/training/alphafind_training/bin/EDTSurf b/training/alphafind_training/bin/EDTSurf
new file mode 100755
index 0000000..24061fe
Binary files /dev/null and b/training/alphafind_training/bin/EDTSurf differ
diff --git a/training/alphafind_training/bin/map2zernike b/training/alphafind_training/bin/map2zernike
new file mode 100755
index 0000000..514f114
Binary files /dev/null and b/training/alphafind_training/bin/map2zernike differ
diff --git a/training/alphafind_training/bin/obj2grid b/training/alphafind_training/bin/obj2grid
new file mode 100755
index 0000000..a4c333b
Binary files /dev/null and b/training/alphafind_training/bin/obj2grid differ
diff --git a/training/alphafind_training/cluster.py b/training/alphafind_training/cluster.py
index 11facb0..7815a83 100644
--- a/training/alphafind_training/cluster.py
+++ b/training/alphafind_training/cluster.py
@@ -1,6 +1,7 @@
 import logging
 
 import numpy as np
+import pandas as pd
 import torch
 from alphafind_training.clustering import run_clustering
@@ -12,12 +13,12 @@ np.random.seed(2023)
 
 
-def create_kmeans(input_path, output_path, n_clusters=2, sample_size=108, n_iterations=10):
+def create_kmeans(input_path, output_path, n_clusters=2, sample_size=108, n_iterations=20):
     """
     Function for clustering the embeddings using K-Means.
 
     Args:
-        input_path (str): Path to the embeddings pickle file or directory of pickle files
+        input_path (str): Path to the embeddings parquet file or directory of parquet files
         output_path (str): Path to the output K-Means file
         n_clusters (int): Number of clusters (default: 2)
         sample_size (int): Size of the sample (default: 108)
@@ -29,10 +30,11 @@ def create_kmeans(input_path, output_path, n_clusters=2, sample_size=108, n_iter
     assert file_exists(input_path) or dir_exists(input_path), 'Input file or directory does not exist'
 
     LOG.info('Loading embeddings')
+    LOG.info('from %s', input_path)
     if dir_exists(input_path) and not file_exists(input_path):
         embeddings, _ = load_dataset(input_path, sample_size, shuffle=True)
     else:
-        embeddings = load_pickle(input_path)
+        embeddings = pd.read_parquet(input_path)
 
     assert embeddings.shape[0] >= sample_size, 'Sample size must be smaller than the number of embeddings'
@@ -53,7 +55,7 @@
     parser = argparse.ArgumentParser(description="Cluster embeddings using K-Means")
     parser.add_argument(
-        '--input', type=str, required=True, help='Path to the embeddings pickle file or directory of pickle files'
+        '--input', type=str, required=True, help='Path to the embeddings parquet file or directory of parquet files'
     )
     parser.add_argument('--output', type=str, required=True, help='Path to the output K-Means file')
     parser.add_argument('--n-clusters', type=int, default=2, help='Number of clusters')
diff --git a/training/alphafind_training/create_buckets.py b/training/alphafind_training/create_buckets.py
index 0efbba9..b7bd778 100644
--- a/training/alphafind_training/create_buckets.py
+++ b/training/alphafind_training/create_buckets.py
@@ -25,16 +25,16 @@ np.random.seed(2023)
 
 LOG = logging.getLogger(__name__)
 
-DEFAULT_DIMENSIONALITY = 45
+DEFAULT_DIMENSIONALITY = 121
 
 
 def load_all_embeddings(path):
     df = pd.DataFrame([])
-    if path.endswith('.pkl'):
-        return load_pickle(path)
+    if path.endswith('.parquet'):
+        return pd.read_parquet(path)
     else:
-        for _, emb_file in tqdm(enumerate([f for f in listdir(path) if f.endswith('.pkl')])):
-            objs = load_pickle(f"{path}/{emb_file}")
+        for _, emb_file in tqdm(enumerate([f for f in listdir(path) if f.endswith('.parquet')])):
+            objs = pd.read_parquet(f"{path}/{emb_file}")
             df = pd.concat([df, objs])
         return df
diff --git a/training/alphafind_training/create_embedding.py b/training/alphafind_training/create_embedding.py
index f65ff26..42d3a72 100644
--- a/training/alphafind_training/create_embedding.py
+++ b/training/alphafind_training/create_embedding.py
@@ -1,9 +1,10 @@
 import argparse
 import logging
 import os
+import subprocess
+import time
 from multiprocessing import Pool
 from pathlib import Path
-from time import time
 
 import numpy as np
 import pandas as pd
@@ -16,16 +17,90 @@ pd.options.mode.chained_assignment = None
 
-DST_THRESHOLD = 20.0
+
+def plytoobj(filename):
+    obj_filename = filename[:-4] + ".obj"
+    obj_file = open(obj_filename, "w")
+
+    with open(filename) as ply_file:
+        ply_file_content = ply_file.read().split("\n")[:-1]
+
+        for content in ply_file_content:
+            content_info = content.split()
+            if len(content_info) == 6:
+                vertex_info = "v " + " ".join(content_info[0:3])
+                obj_file.write(vertex_info + "\n")
+            elif len(content_info) == 7:
+                vertex1, vertex2, vertex3 = map(int, content_info[1:4])
+                vertex1, vertex2, vertex3 = vertex1 + 1, vertex2 + 1, vertex3 + 1
+                face_info = "f " + str(vertex1) + " " + str(vertex2) + " " + str(vertex3)
+                obj_file.write(face_info + "\n")
+
+    obj_file.close()
+
+
+def process_protein(it):
+    start_time = time.time()
+
+    path, protein_file = it
+    protein_file = str(protein_file)
+    protein_name = protein_file.split("-")[1]
+
+    # Convert from .cif -> .pdb
+    subprocess.run(["python3", "./alphafind_training/mmcif_to_pdb.py", "--ciffile", protein_file])
+
+    # Convert to .ply -> .obj -> .grid -> Zernike descriptors
+    subprocess.run(
+        [
+            "./alphafind_training/bin/EDTSurf",
+            "-i",
+            f"{protein_file}.pdb",
+            "-h",
+            "2",
+            "-f",
+            "1",
+            "-o",
+            f"{path}{protein_name}",
+        ]
+    )
+    plytoobj(f"{path}{protein_name}.ply")
+    subprocess.run(["./alphafind_training/bin/obj2grid", "-g", "64", f"{path}{protein_name}.obj"])
+    subprocess.run(
+        [
+            "./alphafind_training/bin/map2zernike",
+            f"{path}{protein_name}.obj.grid",
+            "-c",
+            "0.5",
+        ]
+    )
+
+    # Convert to vector
+    with open(f"{path}{protein_name}.obj.grid.inv") as f:
+        embedding = [float(x.strip()) for x in f.readlines()[1:]]
+
+    # Clean up
+    subprocess.run(
+        [
+            "rm",
+            f"{path}{protein_name}.ply",
+            f"{path}{protein_name}.obj",
+            f"{path}{protein_name}.obj.grid",
+            f"{path}{protein_name}.obj.grid.inv",
+            f"{path}{protein_name}-cav.pdb",
+            f"{protein_file}.pdb"
+        ]
+    )
+
+    LOG.info(f"Processed {protein_name} in {time.time() - start_time:.2f} seconds")
+    return protein_name, embedding
 
-def create_embedding(input_path, output_path, granularity):
-    """Calculate all protein descriptors
+def create_embedding(input_path, output_path):
+    """Calculate all protein descriptors using the new method
 
     Args:
-        input_path (str or Path): path to CIF directory
-        output_path (str or Path): output file path
-        granularity (int): granularity of the descriptors
+        input_path (str or Path): Path to CIF directory
+        output_path (str or Path): Path to save embeddings as a parquet file
 
     Returns:
         None
@@ -34,7 +109,7 @@
     output_path = Path(output_path)
 
     proteins = [file for file in os.listdir(input_path) if file.endswith(".cif")]
-    LOG.info(f'Found {len(proteins)} proteins to create the embedding for')
+    LOG.info(f'Found {len(proteins)} proteins to create embeddings for')
 
     with Pool() as pool:
         results = []
@@ -42,168 +117,29 @@
         index = []
 
         for protein in proteins:
-            result = pool.apply_async(process_protein, (input_path / protein, granularity))
+            result = pool.apply_async(process_protein, ((input_path, input_path / protein),))
             results.append(result)
 
         LOG.info("Processing started")
-        t = time()
-        data = [
-            n for sublist in [result.get()['data'] for result in tqdm(results, total=len(proteins))] for n in sublist
-        ]
-        index = [n for sublist in [result.get()['index'] for result in results] for n in sublist]
-        df = pd.DataFrame(index=index, data=data)
-        df.to_pickle(output_path)
-        t = time() - t
-        LOG.info(f'Processing took {t:.1f} seconds')
-        LOG.info(f'Output saved to {output_path}')
-
+        t = time.time()
+        for result in tqdm(results, total=len(proteins)):
+            protein_name, embedding = result.get()
+            index.append(protein_name)
+            data.append(embedding)
 
-def process_protein(protein, granularity):
-    """Create protein descriptor from file
+        df = pd.DataFrame(index=index, data=data, dtype="float32")
+        # Save as parquet
+        df.to_parquet(output_path)
 
-    Args:
-        protein (Path): path to protein file
-        granularity (int): descriptor granularity
-        fstart (_type_): filename protein id start index
-        fend (_type_): filename protein id end index
-
-    Returns:
-        dict: protein chain id and the descriptor
-    """
-    protein_chains = read_and_extract(protein, granularity)
-
-    data_list = []
-    index_list = []
-    for chain, df in protein_chains:
-        desc = create_descriptor(df, granularity)
-        data_list.append(desc)
-        index_list.append(f"{str(protein).split('/')[-1].split('-')[1].upper()}")
-    return {'index': index_list, 'data': data_list}
-
-
-def create_descriptor(chain_df, granularity):
-    """Create protein descriptor from extracted data
-
-    Args:
-        chain_df (DataFrame): extracted protein data
-        granularity (int): granularity of the descriptor
-    """
-
-    def compute_matrix(row):
-        dist = np.linalg.norm(
-            np.array([row['x_x'], row['y_x'], row['z_x']]) - np.array([row['x_y'], row['y_y'], row['z_y']])
-        )
-        return (DST_THRESHOLD - dist) / DST_THRESHOLD if dist <= DST_THRESHOLD else 0.0
-
-    chain_df['key'] = 0
-    chain_df = chain_df.sort_values('normalized_rs')
-    chain_df = pd.merge(chain_df, chain_df, on='key', how='left')
-    chain_df['dist'] = chain_df.apply(lambda row: compute_matrix(row), axis=1)
-
-    chain_df = chain_df.pivot(index='normalized_rs_x', columns='normalized_rs_y', values='dist')
-    nparray = chain_df.to_numpy(dtype='float16')
-    shape = nparray.shape[0]
-    nparray = np.pad(nparray, (0, granularity - shape), "constant")
-    nup = nparray[np.triu_indices(nparray.shape[0], k=1)]
-    return nup
-
-
-def read_and_extract(protein_file, granularity):  # noqa: C901
-    """Extract protein descriptor data from PDB gz file
+        t = time.time() - t
+        LOG.info(f'Processing took {t:.1f} seconds')
+        LOG.info(f'Output saved to {output_path}')
 
-    Args:
-        protein_file (str): path to protein file
-        granularity (int): descriptor granularity
-    """
-
-    def remap(n, min_, max_):
-        if max_ - min_ >= granularity:
-            return int((n - min_) / (max_ - min_) * (granularity - 1)) + 1
-        return n - min_ + 1
-
-    df = pd.DataFrame(columns=['atom', 'residue', 'chain', 'residue_sequence', 'x', 'y', 'z'])
-
-    atoms = []
-    residues = []
-    chains = []
-    residue_sequences = []
-    xs = []
-    ys = []
-    zs = []
-
-    with open(protein_file, 'rt') as file:
-        model = True
-        for line in file:
-            words = line.split()
-            if len(words) == 0:
-                continue
-            if model and line[0:4] == "ATOM":
-                atoms.append(words[3])
-                residues.append(words[5])
-                chains.append(words[6])
-                if words[6] != "A":
-                    print("Chain is not A")
-                residue_sequences.append(words[8])
-                xs.append(words[10])
-                ys.append(words[11])
-                zs.append(words[12])
-
-    if len(residue_sequences) == 0:
-        return []
-
-    coded_residue_sequences = []
-    index = 1
-    last = residue_sequences[0]
-    for rs in residue_sequences:
-        if rs == last:
-            coded_residue_sequences.append(index)
-        else:
-            index += 1
-            coded_residue_sequences.append(index)
-            last = rs
-
-    df = pd.DataFrame(
-        {
-            'atom': atoms,
-            'residue': residues,
-            'chain': chains,
-            'residue_sequence': coded_residue_sequences,
-            'x': xs,
-            'y': ys,
-            'z': zs,
-        }
-    )
-    df = df.astype({'residue_sequence': int, 'x': float, 'y': float, 'z': float})
-    chains = df['chain'].unique()
-    tables = []
-    for chain in chains:
-        table = df[df["chain"] == chain]
-        min_ = np.min(table["residue_sequence"])
-        max_ = np.max(table["residue_sequence"])
-        table.loc[:, "normalized_rs"] = table.loc[:, "residue_sequence"].apply(lambda x: remap(x, min_, max_))
-        table = table.drop(['residue_sequence'], axis=1)
-        table = table.groupby(['chain', 'normalized_rs'])
-        table = table[["x", "y", "z"]].mean().reset_index()
-        table = table.sort_values(['chain', 'normalized_rs'])
-        tables.append((chain, table))
-    return tables
-
-
-"""
-Script for creating protein descriptors from CIF files.
-Used in AlphaFind to create the protein descriptors used in building an index and fast searching.
-
-Input: Directory containing CIF files
-Output: Pickle file containing the protein descriptors
-
-EXAMPLE USE:
-python3 create-embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10
-"""
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Create protein descriptors from CIF files")
     parser.add_argument("--input", type=str, required=True, help="Path to the directory containing CIF files")
     parser.add_argument("--output", type=str, required=True, help="Path to the output file")
-    parser.add_argument("--granularity", type=int, default=10, help="How detailed should the descriptor be")
 
     args = parser.parse_args()
 
@@ -213,4 +149,4 @@
     output_path = Path(args.output)
 
     assert input_path.exists(), f"Input path {input_path} does not exist"
-    create_embedding(input_path, output_path, args.granularity)
+    create_embedding(input_path, output_path)
diff --git a/training/alphafind_training/mmcif_to_pdb.py b/training/alphafind_training/mmcif_to_pdb.py
new file mode 100644
index 0000000..1f7e79b
--- /dev/null
+++ b/training/alphafind_training/mmcif_to_pdb.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+"""
+Script to convert mmCIF files to PDB format.
+
+usage: python cif2pdb.py ciffile [pdbfile]
+
+Requires python BioPython (`pip install biopython`). It should work with recent version of python 2 or 3.
+
+@author Spencer Bliven
+From: https://gist.github.com/sbliven/b7cc2c5305aa3652a75a580ae7c6ce33
+"""
+
+import argparse
+import logging
+import sys
+
+from Bio.PDB import PDBIO
+from Bio.PDB.MMCIFParser import MMCIFParser
+
+
+def int_to_chain(i, base=62):
+    """
+    int_to_chain(int,int) -> str
+
+    Converts a positive integer to a chain ID. Chain IDs include uppercase
+    characters, numbers, and optionally lowercase letters.
+
+    i = a positive integer to convert
+    base = the alphabet size to include. Typically 36 or 62.
+    """
+    if i < 0:
+        raise ValueError("positive integers only")
+    if base < 0 or 62 < base:
+        raise ValueError("Invalid base")
+
+    quot = int(i) // base
+    rem = i % base
+    if rem < 26:
+        letter = chr(ord("A") + rem)
+    elif rem < 36:
+        letter = str(rem - 26)
+    else:
+        letter = chr(ord("a") + rem - 36)
+    if quot == 0:
+        return letter
+    else:
+        return int_to_chain(quot - 1, base) + letter
+
+
+class OutOfChainsError(Exception):
+    pass
+
+
+def rename_chains(structure):
+    """Renames chains to be one-letter chains
+
+    Existing one-letter chains will be kept. Multi-letter chains will be truncated
+    or renamed to the next available letter of the alphabet.
+
+    If more than 62 chains are present in the structure, raises an OutOfChainsError
+
+    Returns a map between new and old chain IDs, as well as modifying the input structure
+    """
+    next_chain = 0  #
+    # single-letters stay the same
+    chainmap = {c.id: c.id for c in structure.get_chains() if len(c.id) == 1}
+    for o in structure.get_chains():
+        if len(o.id) != 1:
+            if o.id[0] not in chainmap:
+                chainmap[o.id[0]] = o.id
+                o.id = o.id[0]
+            else:
+                c = int_to_chain(next_chain)
+                while c in chainmap:
+                    next_chain += 1
+                    c = int_to_chain(next_chain)
+                if next_chain >= 62:
+                    raise OutOfChainsError()
+                chainmap[c] = o.id
+                o.id = c
+    return chainmap
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert mmCIF to PDB format")
+    parser.add_argument("--ciffile", help="mmCIF input file")
+    parser.add_argument(
+        "--pdbfile", nargs="?", help="PDB output file. Default based on CIF filename"
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        help="Long messages",
+        dest="verbose",
+        default=False,
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="%(levelname)s: %(message)s",
+        level=logging.DEBUG if args.verbose else logging.WARN,
+    )
+
+    ciffile = args.ciffile
+    pdbfile = args.pdbfile or ciffile + ".pdb"
+    # Not sure why biopython needs this to read a cif file
+    strucid = ciffile[:4] if len(ciffile) > 4 else "1xxx"
+
+    # Read file
+    parser = MMCIFParser()
+    structure = parser.get_structure(strucid, ciffile)
+
+    # rename long chains
+    try:
+        chainmap = rename_chains(structure)
+    except OutOfChainsError:
+        logging.error("Too many chains to represent in PDB format")
+        sys.exit(1)
+
+    if args.verbose:
+        for new, old in chainmap.items():
+            if new != old:
+                logging.info("Renaming chain {0} to {1}".format(old, new))
+
+    # Write PDB
+    io = PDBIO()
+    io.set_structure(structure)
+    # TODO What happens with large structures?
+    io.save(pdbfile)
diff --git a/training/alphafind_training/model.py b/training/alphafind_training/model.py
index 98f76a7..709f60f 100644
--- a/training/alphafind_training/model.py
+++ b/training/alphafind_training/model.py
@@ -7,6 +7,7 @@ from typing import Tuple
 
 import numpy as np
+import pandas as pd
 import torch
 import torch.nn.functional as nnf
 import torch.utils.data
@@ -204,7 +205,7 @@ def __init__(
         input_dim,
         output_dim,
         loss=torch.nn.CrossEntropyLoss,
-        lr=0.1,
+        lr=0.01,
         model_type='MLP',
         class_weight=None,
     ):
@@ -284,7 +285,7 @@ def __init__(self, path, chunk, kmeans_index):
         else:
             self.path = f'{path}/{os.listdir(path)[chunk]}'
         self.kmeans_index = kmeans_index
-        self.data = data_X_to_torch(load_pickle(self.path))
+        self.data = data_X_to_torch(pd.read_parquet(self.path))
         self.labels = data_y_to_torch(assign_labels(self.kmeans_index, self.data))
 
     def __len__(self):
@@ -297,7 +298,7 @@ def __getitem__(self, idx):
 
 class LIDatasetPredict(torch.utils.data.Dataset):
     def __init__(self, path):
         self.path = path
-        self.data_pd = load_pickle(self.path)
+        self.data_pd = pd.read_parquet(self.path)
         self.data = data_X_to_torch(self.data_pd)
 
     def __len__(self):
diff --git a/training/alphafind_training/train.py b/training/alphafind_training/train.py
index f566c20..06fc3fe 100644
--- a/training/alphafind_training/train.py
+++ b/training/alphafind_training/train.py
@@ -25,12 +25,12 @@ def train_model(
     input_path,
     output_model_dir,
     kmeans_path,
-    model='MLP',
+    model='MLP10',
     model_path=None,
-    epochs=10,
+    epochs=30,
     n_classes=2,
     batch_size=32,
-    dimensionality=45,
+    dimensionality=121,
     wandb_project='small-data-training',
     wandb_entity='protein-db',
 ):
     """
@@ -38,7 +38,7 @@
     Train a model on the embeddings dataset.
 
     Args:
-        input_path (str): Path to the embeddings pickle file or directory of pickle files
+        input_path (str): Path to the embeddings parquet file or directory of parquet files
         output_model_dir (str): Path to the output model directory
         kmeans_path (str): Path to the k-means model
         model (str): Model to use (default: 'MLP')
@@ -103,7 +103,7 @@ def train_model(
     if not dir_exists(config.input) and file_exists(config.input):
         n_chunks = 1
     else:
-        n_chunks = len([f for f in os.listdir(config.input) if f.endswith('.pkl')])
+        n_chunks = len([f for f in os.listdir(config.input) if f.endswith('.parquet')])
 
     nn, _ = load_model(config.model_path, config.dimensionality, config.n_classes, config.model)
     LOG.info(f'Starting training with epochs={config.epochs}, n_chunks={n_chunks}')
@@ -152,7 +152,7 @@ def train_model(
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Train a model on the embeddings dataset")
     parser.add_argument(
-        '--input', type=str, required=True, help='Path to the embeddings pickle file or directory of pickle files'
+        '--input', type=str, required=True, help='Path to the embeddings parquet file or directory of parquet files'
    )
     parser.add_argument('--output-model-dir', type=str, required=True, help='Path to the output model dir')
     parser.add_argument('-m', '--model', type=str, default='MLP', help='Model to use')
@@ -163,7 +163,7 @@ def train_model(
     parser.add_argument('-e', '--epochs', type=int, default=10, help='Number of epochs')
     parser.add_argument('--n-classes', type=int, default=2, help='Number of classes to use')
     parser.add_argument('--batch-size', type=int, default=32, help='Batch size')
-    parser.add_argument('--dimensionality', type=int, default=45, help='Number of dimensions of the data')
+    parser.add_argument('--dimensionality', type=int, default=121, help='Number of dimensions of the data')
 
     args = parser.parse_args()
diff --git a/training/alphafind_training/utils.py b/training/alphafind_training/utils.py
index eaa5690..ab6ab21 100644
--- a/training/alphafind_training/utils.py
+++ b/training/alphafind_training/utils.py
@@ -103,6 +103,11 @@ def save_pickle(pickle_path, data):
         pickle.dump(data, f)
 
 
+def save_parquet(parquet_path, data):
+    if not file_exists(parquet_path):
+        data.to_parquet(parquet_path)
+
+
 def write_row_to_csv(path, row):
     import csv
 
@@ -130,24 +135,24 @@ def load_newest_file_in_dir(dir_path):
 def load_dataset(
     path: str,
     chunk_size: int = 10_000_000,
-    pickle_files_used: list = [],
+    parquet_files_used: list = [],
     shuffle: bool = False,
 ):
     dataset_path = "/".join(path.split("/")[:-1]) if path.endswith("/") else path
-    LOG.info(f"Loading pickle files from: {dataset_path}")
+    LOG.info(f"Loading parquet files from: {dataset_path}")
     assert os.path.isdir(dataset_path), f"{dataset_path} is not a directory"
 
     data = pd.DataFrame([])
-    emb_pickles = os.listdir(dataset_path)
+    emb_parquets = os.listdir(dataset_path)
     if shuffle:
-        random.shuffle(emb_pickles)
+        random.shuffle(emb_parquets)
 
     LOG.info(f"Loading chunk_size={chunk_size} proteins.")
-    for i, pickle_file in enumerate(emb_pickles):
-        if pickle_file not in pickle_files_used:
+    for i, parquet_file in enumerate(emb_parquets):
+        if parquet_file not in parquet_files_used and parquet_file.endswith(".parquet"):
             if i % 10 == 0:
                 LOG.info(f"Loaded {data.shape[0]} / {chunk_size} proteins.")
-            data_pd = load_pickle(f"{dataset_path}/{pickle_file}")
+            data_pd = pd.read_parquet(f"{dataset_path}/{parquet_file}")
             data = pd.concat([data, data_pd])
-            pickle_files_used.append(pickle_file)
+            parquet_files_used.append(parquet_file)
 
             if data.shape[0] >= chunk_size:
                 break
-    return data, pickle_files_used
+    return data, parquet_files_used
diff --git a/training/requirements.txt b/training/requirements.txt
index 3e47df6..015e8dc 100644
--- a/training/requirements.txt
+++ b/training/requirements.txt
@@ -14,3 +14,5 @@ wandb
 tqdm
 h5py
 torch-summary
+pyarrow
+biopython
\ No newline at end of file
diff --git a/training/train_alphafind.py b/training/train_alphafind.py
index 16873ae..66e8ae4 100644
--- a/training/train_alphafind.py
+++ b/training/train_alphafind.py
@@ -11,19 +11,19 @@ def train_alphafind(base_dir, data_dir, models_dir):
     # 1) Create embeddings
     create_embedding(
-        input_path=os.path.join(data_dir, "cifs"), output_path=os.path.join(data_dir, "embedding.pkl"), granularity=10
+        input_path=os.path.join(data_dir, "cifs"), output_path=os.path.join(data_dir, "embedding.parquet")
     )
 
     # 2) Create a K-Means object
     create_kmeans(
-        input_path=os.path.join(data_dir, "embedding.pkl"),
+        input_path=os.path.join(data_dir, "embedding.parquet"),
         output_path=os.path.join(data_dir, "kmeans.idx"),
         n_clusters=2,
     )
 
     # 3) Train a model
     train_model(
-        input_path=os.path.join(data_dir, "embedding.pkl"),
+        input_path=os.path.join(data_dir, "embedding.parquet"),
         kmeans_path=os.path.join(data_dir, "kmeans.idx"),
         output_model_dir=models_dir,
         n_classes=2,
@@ -31,7 +31,7 @@ def train_alphafind(base_dir, data_dir, models_dir):
     # 4) Create bucket-data
     create_buckets(
-        input_path=os.path.join(data_dir, "embedding.pkl"),
+        input_path=os.path.join(data_dir, "embedding.parquet"),
         model_dir_path=models_dir,
         output_chunks=os.path.join(data_dir, "chunks"),
         output_predictions=os.path.join(data_dir, "overall"),
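
Reviewer note (not part of the patch): the sketch below walks through the updated parquet-based pipeline that this diff wires together in training/train_alphafind.py, using only the call signatures visible in the hunks above. The import paths and the directory names are assumptions (the actual import lines fall outside the shown hunks), and the create_buckets step is left as a comment because its argument list is only partially visible in the last hunk.

# Minimal usage sketch of the reworked training pipeline; import paths and directories are assumed.
import os

# Assumed import locations -- mirror however training/train_alphafind.py actually imports these.
from alphafind_training.create_embedding import create_embedding
from alphafind_training.cluster import create_kmeans
from alphafind_training.train import train_model

data_dir = "./training/data"      # illustrative; mounted as /home/user/data in docker-compose.yml
models_dir = "./training/models"  # illustrative; mounted as /home/user/data/models

# 1) CIF files -> 3D-AF-Surfer/Zernike embeddings saved as parquet
#    (the old --granularity knob is gone; descriptors are now 121-dimensional).
create_embedding(
    input_path=os.path.join(data_dir, "cifs"),
    output_path=os.path.join(data_dir, "embedding.parquet"),
)

# 2) K-Means over the parquet embeddings (n_iterations now defaults to 20).
create_kmeans(
    input_path=os.path.join(data_dir, "embedding.parquet"),
    output_path=os.path.join(data_dir, "kmeans.idx"),
    n_clusters=2,
)

# 3) Train the model (new defaults: MLP10, 30 epochs, dimensionality=121, lr=0.01).
train_model(
    input_path=os.path.join(data_dir, "embedding.parquet"),
    kmeans_path=os.path.join(data_dir, "kmeans.idx"),
    output_model_dir=models_dir,
    n_classes=2,
)

# 4) create_buckets(...) follows as in train_alphafind.py; its remaining arguments are not shown in this diff.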