-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtokenize_compare_sequences.py
120 lines (92 loc) · 4.47 KB
/
tokenize_compare_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
This script tokenizes the sequences in the sequence_metrics.csv file and compares the
sequences to each other using cosine similarity.
"""
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
# Third-party requirements: tensorflow (which provides tensorflow.keras) and scikit-learn.
# If they are missing, install them from a shell before running this script:
#   pip install tensorflow
#   pip install keras
#   pip install scikit-learn
# Change the working directory to the folder that holds sequence_metrics.csv, so the
# relative file names used below are read from and written to that folder.
# NOTE: this runs at import time and raises FileNotFoundError if "updated_data" does
# not exist relative to the current working directory.
os.chdir("updated_data")
def cosine_simulation(csv_file, column, output_file):
    """
    Score every sequence in one column of a csv file by its average cosine
    similarity to all sequences in that column and write the scores to a csv file.

    Each call processes exactly one column; call the function once per
    tokenization level (e.g. "sequence", "4mers", "5mers", ...).

    Parameters
    ----------
    csv_file : path of the csv file to read; it must contain an "accession_id"
        column and the column named by `column`
    column : name of the column in the csv file that holds the sequences
    output_file : path of the csv file the scores are written to

    Returns
    -------
    Whatever ``DataFrame.to_csv(output_file)`` returns, i.e. None when
    `output_file` is a path. The written file has the columns "accession_id",
    "cosine_similarity_average" and "label_encoded" (plus the row index), with
    one row per distinct accession id.

    Example
    -------
    >>> cosine_simulation("sequence_metrics.csv", "sequence",
    ...                   "cosine_similarity_characterlvl.csv")
    """
    df = pd.read_csv(csv_file)

    # Keep the original accession ids for the output, then replace the column
    # with integer labels.
    original_ids = df["accession_id"].copy()
    df["accession_id"] = LabelEncoder().fit_transform(df["accession_id"])

    # Split every entry of `column` into tokens.
    # NOTE(review): list(x) splits a string into single characters, so if the
    # k-mer columns store their k-mers inside one string they are still compared
    # character by character -- confirm against the layout of the csv file.
    tokens_per_row = df[column].apply(list)

    # Fit ONE encoder on the vocabulary of the whole column. Fitting a fresh
    # encoder per row (as was done before) gives the same token different
    # integer codes in different rows, which makes the vectors incomparable.
    vocabulary = {token for tokens in tokens_per_row for token in tokens}
    token_encoder = LabelEncoder().fit(list(vocabulary))

    # Shift the codes by one so that 0 is reserved for the padding value and
    # cannot be confused with a real token.
    df["sequence_split"] = tokens_per_row.apply(
        lambda tokens: token_encoder.transform(tokens) + 1
    )

    # Pad the label-encoded sequences with trailing zeros up to the length of
    # the longest sequence so they form a rectangular matrix.
    padded = pad_sequences(df["sequence_split"], padding="post")

    # Pairwise cosine similarity: entry (i, j) compares sequence i with sequence j.
    similarity_matrix = cosine_similarity(padded)

    # Collapse each row to a single score: the mean similarity of a sequence to
    # all sequences in the column (the sequence itself included).
    average_similarity = similarity_matrix.mean(axis=1)

    # Assemble the result in the column order of the output file: original id,
    # average score, integer label of the id.
    cosine_similarity_df = pd.DataFrame(
        {
            "accession_id": original_ids,
            "cosine_similarity_average": average_similarity,
            "label_encoded": df["accession_id"],
        }
    )

    # Keep only the first row of every accession id.
    cosine_similarity_df = cosine_similarity_df.drop_duplicates("accession_id")

    # Write the scores to disk.
    return cosine_similarity_df.to_csv(output_file)
if __name__ == "__main__":
    # One comparison per tokenization level: the full character sequence first,
    # then the 4-, 5-, 6- and 9-mer columns. Each entry pairs the column to read
    # with the csv file the scores are written to.
    comparisons = (
        ("sequence", "cosine_similarity_characterlvl.csv"),
        ("4mers", "cosine_similarity_4_mer.csv"),
        ("5mers", "cosine_similarity_5_mer.csv"),
        ("6mers", "cosine_similarity_6_mer.csv"),
        ("9mers", "cosine_similarity_9_mer.csv"),
    )
    for sequence_column, scores_file in comparisons:
        cosine_simulation("sequence_metrics.csv", sequence_column, scores_file)
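# How to interpret the cosine similarity scores (rough rule of thumb; the values
# written to the csv files are per-sequence averages over all comparisons):
# 0.0 - 0.2 : low similarity
# 0.2 - 0.4 : moderate similarity
# 0.4 - 0.6 : high similarity
# 0.6 - 0.8 : very high similarity
# 0.8 - 1.0 : nearly identical sequences (a pairwise score of exactly 1.0 means identical encoded vectors)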
# Next steps:
# - use the scores as input for a clustering analysis to see how the sequences are related to each other
# - use the scores as a feature in a classification analysis to separate similar from dissimilar sequences