@@ -560,7 +560,7 @@ def get_tei(adata,
560
560
:param normalize_total: Normalize counts per cell prior TEI calculation.
561
561
:param log1p: Logarithmize the data matrix prior TEI calculation.
562
562
:param target_sum: After normalization, each observation (cell) has a total count equal to target_sum.
563
- :param chunk_size: Number of chuns .
563
+ :param chunk_size: Number of chunks .
564
564
:return: Transcriptome evolutionary index (TEI) values.
565
565
566
566
:type adata: AnnData
@@ -729,7 +729,7 @@ def get_pmatrix(adata,
729
729
:param normalize_total: Normalize counts per cell prior TEI calculation.
730
730
:param log1p: Logarithmize the data matrix prior TEI calculation.
731
731
:param target_sum: After normalization, each observation (cell) has a total count equal to target_sum.
732
- :param chunk_size: Number of chuns .
732
+ :param chunk_size: Number of chunks .
733
733
:return: Partial transcriptome evolutionary index (TEI) values.
734
734
735
735
:type adata: AnnData
@@ -811,7 +811,6 @@ def get_pmatrix(adata,
811
811
right = adata .var [kv ][adata .var_names .isin (all_id_age_df_keep_subset_chunks [0 ]['GeneID' ])],
812
812
left_index = True ,
813
813
right_index = True )[kv ]
814
- print (all_var_names_df_chunks [0 ])
815
814
adata_pmatrix .var ['Phylostrata' ] = list (pd .merge (left = pd .DataFrame (adata_pmatrix .var_names .values ,
816
815
columns = ['GeneID' ]),
817
816
right = all_var_names_df_chunks [0 ],
@@ -832,7 +831,8 @@ def get_pstrata(adata,
832
831
standard_scale = None ,
833
832
normalize_total = True ,
834
833
log1p = True ,
835
- target_sum = 1e6 ):
834
+ target_sum = 1e6 ,
835
+ chunk_size = 100000 ):
836
836
"""
837
837
This function computes the partial transcriptome evolutionary index (TEI) values combined for each stratum.
838
838
@@ -869,6 +869,7 @@ def get_pstrata(adata,
869
869
:param normalize_total: Normalize counts per cell prior TEI calculation.
870
870
:param log1p: Logarithmize the data matrix prior TEI calculation.
871
871
:param target_sum: After normalization, each observation (cell) has a total count equal to target_sum.
872
+ :param chunk_size: Number of chunks.
872
873
:return: List of two DataFrame. First DataFrame contains the summed partial TEI values per strata.
873
874
Second DataFrame contains summed partial TEI values divided by the corresponding global TEI value,
874
875
which represent percentage of global TEI per strata.
@@ -886,6 +887,7 @@ def get_pstrata(adata,
886
887
:type normalize_total: bool
887
888
:type log1p: bool
888
889
:type target_sum: float
890
+ :type chunk_size: int
889
891
:rtype: list
890
892
891
893
Example
@@ -928,75 +930,83 @@ def get_pstrata(adata,
928
930
>>> sns.heatmap(packer19_small_pstrata_grouped[1], annot=True, cmap='viridis')
929
931
>>> plt.show()
930
932
"""
931
- var_names_df ,\
932
- id_age_df_keep_subset ,\
933
- adata_counts ,\
934
- var_names_subset ,\
935
- sumx ,\
936
- sumx_recd ,\
937
- ps ,\
938
- psd = _get_psd (adata = adata ,
939
- gene_id = gene_id ,
940
- gene_age = gene_age ,
941
- keep = keep ,
942
- layer = layer ,
943
- normalize_total = normalize_total ,
944
- log1p = log1p ,
945
- target_sum = target_sum )
946
- wmatrix = psd .dot (adata_counts .transpose ()).transpose ()
947
- pmatrix = sumx_recd .dot (wmatrix )
948
- tei = pmatrix .sum (1 )
949
- phylostrata = list (set (id_age_df_keep_subset ['Phylostrata' ]))
950
- pstrata_norm_by_sumx = np .zeros ((len (phylostrata ), pmatrix .shape [0 ]))
951
- pstrata_norm_by_pmatrix_sum = np .zeros ((len (phylostrata ), pmatrix .shape [0 ]))
952
- for pk_idx , pk in enumerate (phylostrata ):
953
- pstrata_norm_by_sumx [pk_idx , ] = np .array (pmatrix [:, id_age_df_keep_subset ['Phylostrata' ].isin ([pk ])]
954
- .sum (1 )).flatten ()
955
- pstrata_norm_by_pmatrix_sum [pk_idx , ] = np .array (pmatrix [:, id_age_df_keep_subset ['Phylostrata' ]
956
- .isin ([pk ])].sum (1 ) / tei ).flatten ()
957
- pstrata_norm_by_sumx_df = pd .DataFrame (pstrata_norm_by_sumx )
958
- pstrata_norm_by_sumx_df ['ps' ] = phylostrata
959
- pstrata_norm_by_sumx_df .set_index ('ps' ,
960
- inplace = True )
961
- pstrata_norm_by_sumx_df .columns = adata .obs_names
962
- pstrata_norm_by_pmatrix_sum_df = pd .DataFrame (pstrata_norm_by_pmatrix_sum )
963
- pstrata_norm_by_pmatrix_sum_df ['ps' ] = phylostrata
964
- pstrata_norm_by_pmatrix_sum_df .set_index ('ps' ,
965
- inplace = True )
966
- pstrata_norm_by_pmatrix_sum_df .columns = adata .obs_names
967
- if cumsum :
968
- pstrata_norm_by_sumx_df = pstrata_norm_by_sumx_df .cumsum (0 )
969
- pstrata_norm_by_pmatrix_sum_df = pstrata_norm_by_pmatrix_sum_df .cumsum (0 )
970
- if group_by_obs is not None :
971
- if adata .obs [group_by_obs ].dtype .name == 'category' :
972
- obs_group_nan = pd .DataFrame (adata .obs [group_by_obs ])
973
- else :
974
- obs_group_nan = pd .DataFrame (adata .obs [group_by_obs ].fillna (obs_fillna ))
975
- if obs_type == 'mean' :
976
- pstrata_norm_by_sumx_df = \
977
- pstrata_norm_by_sumx_df .transpose ().groupby (obs_group_nan [group_by_obs ]).mean ().transpose ()
978
- pstrata_norm_by_pmatrix_sum_df = \
979
- pstrata_norm_by_pmatrix_sum_df .transpose ().groupby (obs_group_nan [group_by_obs ]).mean ().transpose ()
980
- if obs_type == 'median' :
981
- pstrata_norm_by_sumx_df = \
982
- pstrata_norm_by_sumx_df .transpose ().groupby (obs_group_nan [group_by_obs ]).median ().transpose ()
983
- pstrata_norm_by_pmatrix_sum_df = \
984
- pstrata_norm_by_pmatrix_sum_df .transpose ().groupby (obs_group_nan [group_by_obs ]).median ().transpose ()
985
- if obs_type == 'sum' :
986
- pstrata_norm_by_sumx_df = \
987
- pstrata_norm_by_sumx_df .transpose ().groupby (obs_group_nan [group_by_obs ]).sum ().transpose ()
988
- pstrata_norm_by_pmatrix_sum_df = \
989
- pstrata_norm_by_pmatrix_sum_df .transpose ().groupby (obs_group_nan [group_by_obs ]).sum ().transpose ()
990
- if obs_type == 'min' :
991
- pstrata_norm_by_sumx_df = \
992
- pstrata_norm_by_sumx_df .transpose ().groupby (obs_group_nan [group_by_obs ]).min ().transpose ()
993
- pstrata_norm_by_pmatrix_sum_df = \
994
- pstrata_norm_by_pmatrix_sum_df .transpose ().groupby (obs_group_nan [group_by_obs ]).min ().transpose ()
995
- if obs_type == 'max' :
996
- pstrata_norm_by_sumx_df = \
997
- pstrata_norm_by_sumx_df .transpose ().groupby (obs_group_nan [group_by_obs ]).max ().transpose ()
998
- pstrata_norm_by_pmatrix_sum_df = \
999
- pstrata_norm_by_pmatrix_sum_df .transpose ().groupby (obs_group_nan [group_by_obs ]).max ().transpose ()
933
+ adata_pstrata_norm_by_sumx_df_chunks = []
934
+ adata_pstrata_norm_by_pmatrix_sum_df_chunks = []
935
+ for i in range (0 , adata .shape [0 ], chunk_size ):
936
+ adata_subset = adata [i :i + chunk_size ]
937
+ var_names_df_chunk ,\
938
+ id_age_df_keep_subset_chunk ,\
939
+ adata_counts_chunk ,\
940
+ var_names_subset_chunk ,\
941
+ sumx_chunk ,\
942
+ sumx_recd_chunk ,\
943
+ ps_chunk ,\
944
+ psd_chunk = _get_psd (adata = adata_subset ,
945
+ gene_id = gene_id ,
946
+ gene_age = gene_age ,
947
+ keep = keep ,
948
+ layer = layer ,
949
+ normalize_total = normalize_total ,
950
+ log1p = log1p ,
951
+ target_sum = target_sum )
952
+ wmatrix_chunk = psd_chunk .dot (adata_counts_chunk .transpose ()).transpose ()
953
+ pmatrix_chunk = sumx_recd_chunk .dot (wmatrix_chunk )
954
+ tei_chunk = pmatrix_chunk .sum (1 )
955
+ phylostrata_chunk = list (set (id_age_df_keep_subset_chunk ['Phylostrata' ]))
956
+ pstrata_norm_by_sumx_chunk = np .zeros ((len (phylostrata_chunk ), pmatrix_chunk .shape [0 ]))
957
+ pstrata_norm_by_pmatrix_sum_chunk = np .zeros ((len (phylostrata_chunk ), pmatrix_chunk .shape [0 ]))
958
+ for pk_idx , pk in enumerate (phylostrata_chunk ):
959
+ pstrata_norm_by_sumx_chunk [pk_idx , ] = np .array (pmatrix_chunk [:, id_age_df_keep_subset_chunk ['Phylostrata' ].isin ([pk ]).values ]
960
+ .sum (1 )).flatten ()
961
+ pstrata_norm_by_pmatrix_sum_chunk [pk_idx , ] = np .array (pmatrix_chunk [:, id_age_df_keep_subset_chunk ['Phylostrata' ]
962
+ .isin ([pk ]).values ].sum (1 ) / tei_chunk ).flatten ()
963
+ pstrata_norm_by_sumx_df_chunk = pd .DataFrame (pstrata_norm_by_sumx_chunk )
964
+ pstrata_norm_by_sumx_df_chunk ['ps' ] = phylostrata_chunk
965
+ pstrata_norm_by_sumx_df_chunk .set_index ('ps' ,
966
+ inplace = True )
967
+ pstrata_norm_by_sumx_df_chunk .columns = adata_subset .obs_names
968
+ pstrata_norm_by_pmatrix_sum_df_chunk = pd .DataFrame (pstrata_norm_by_pmatrix_sum_chunk )
969
+ pstrata_norm_by_pmatrix_sum_df_chunk ['ps' ] = phylostrata_chunk
970
+ pstrata_norm_by_pmatrix_sum_df_chunk .set_index ('ps' ,
971
+ inplace = True )
972
+ pstrata_norm_by_pmatrix_sum_df_chunk .columns = adata_subset .obs_names
973
+ if cumsum :
974
+ pstrata_norm_by_sumx_df_chunk = pstrata_norm_by_sumx_df_chunk .cumsum (0 )
975
+ pstrata_norm_by_pmatrix_sum_df_chunk = pstrata_norm_by_pmatrix_sum_df_chunk .cumsum (0 )
976
+ if group_by_obs is not None :
977
+ if adata_subset .obs [group_by_obs ].dtype .name == 'category' :
978
+ obs_group_nan = pd .DataFrame (adata_subset .obs [group_by_obs ])
979
+ else :
980
+ obs_group_nan = pd .DataFrame (adata_subset .obs [group_by_obs ].fillna (obs_fillna ))
981
+ if obs_type == 'mean' :
982
+ pstrata_norm_by_sumx_df_chunk = \
983
+ pstrata_norm_by_sumx_df_chunk .transpose ().groupby (obs_group_nan [group_by_obs ]).mean ().transpose ()
984
+ pstrata_norm_by_pmatrix_sum_df_chunk = \
985
+ pstrata_norm_by_pmatrix_sum_df_chunk .transpose ().groupby (obs_group_nan [group_by_obs ]).mean ().transpose ()
986
+ if obs_type == 'median' :
987
+ pstrata_norm_by_sumx_df_chunk = \
988
+ pstrata_norm_by_sumx_df_chunk .transpose ().groupby (obs_group_nan [group_by_obs ]).median ().transpose ()
989
+ pstrata_norm_by_pmatrix_sum_df_chunk = \
990
+ pstrata_norm_by_pmatrix_sum_df_chunk .transpose ().groupby (obs_group_nan [group_by_obs ]).median ().transpose ()
991
+ if obs_type == 'sum' :
992
+ pstrata_norm_by_sumx_df_chunk = \
993
+ pstrata_norm_by_sumx_df_chunk .transpose ().groupby (obs_group_nan [group_by_obs ]).sum ().transpose ()
994
+ pstrata_norm_by_pmatrix_sum_df_chunk = \
995
+ pstrata_norm_by_pmatrix_sum_df_chunk .transpose ().groupby (obs_group_nan [group_by_obs ]).sum ().transpose ()
996
+ if obs_type == 'min' :
997
+ pstrata_norm_by_sumx_df_chunk = \
998
+ pstrata_norm_by_sumx_df_chunk .transpose ().groupby (obs_group_nan [group_by_obs ]).min ().transpose ()
999
+ pstrata_norm_by_pmatrix_sum_df_chunk = \
1000
+ pstrata_norm_by_pmatrix_sum_df_chunk .transpose ().groupby (obs_group_nan [group_by_obs ]).min ().transpose ()
1001
+ if obs_type == 'max' :
1002
+ pstrata_norm_by_sumx_df_chunk = \
1003
+ pstrata_norm_by_sumx_df_chunk .transpose ().groupby (obs_group_nan [group_by_obs ]).max ().transpose ()
1004
+ pstrata_norm_by_pmatrix_sum_df_chunk = \
1005
+ pstrata_norm_by_pmatrix_sum_df_chunk .transpose ().groupby (obs_group_nan [group_by_obs ]).max ().transpose ()
1006
+ adata_pstrata_norm_by_sumx_df_chunks .append (pstrata_norm_by_sumx_df_chunk )
1007
+ adata_pstrata_norm_by_pmatrix_sum_df_chunks .append (pstrata_norm_by_pmatrix_sum_df_chunk )
1008
+ pstrata_norm_by_sumx_df = pd .concat (adata_pstrata_norm_by_sumx_df_chunks , axis = 1 )
1009
+ pstrata_norm_by_pmatrix_sum_df = pd .concat (adata_pstrata_norm_by_pmatrix_sum_df_chunks , axis = 1 )
1000
1010
if standard_scale is not None :
1001
1011
if standard_scale == 0 :
1002
1012
pstrata_norm_by_sumx_df = pstrata_norm_by_sumx_df .apply (_min_max_to_01 ,
@@ -1054,7 +1064,8 @@ def get_ematrix(adata,
1054
1064
standard_scale = None ,
1055
1065
normalize_total = True ,
1056
1066
log1p = True ,
1057
- target_sum = 1e6 ):
1067
+ target_sum = 1e6 ,
1068
+ chunk_size = 100000 ):
1058
1069
"""
1059
1070
This function computes expression profiles for all genes or group of genes 'group_by_var' (default: None).
1060
1071
@@ -1107,6 +1118,7 @@ def get_ematrix(adata,
1107
1118
:param normalize_total: Normalize counts per cell.
1108
1119
:param log1p: Logarithmize the data matrix.
1109
1120
:param target_sum: After normalization, each observation (cell) has a total count equal to target_sum.
1121
+ :param chunk_size: Number of chunks.
1110
1122
:return: Expression profile DataFrame.
1111
1123
1112
1124
:type adata: AnnData
@@ -1121,6 +1133,7 @@ def get_ematrix(adata,
1121
1133
:type normalize_total: bool
1122
1134
:type log1p: bool
1123
1135
:type target_sum: float
1136
+ :type chunk_size: int
1124
1137
:rtype: pandas.DataFrame
1125
1138
1126
1139
Example
@@ -1312,7 +1325,8 @@ def get_rematrix(adata,
1312
1325
standard_scale = None ,
1313
1326
normalize_total = True ,
1314
1327
log1p = True ,
1315
- target_sum = 1e6 ):
1328
+ target_sum = 1e6 ,
1329
+ chunk_size = 100000 ):
1316
1330
"""
1317
1331
This function computes relative expression profiles.
1318
1332
@@ -1364,6 +1378,7 @@ def get_rematrix(adata,
1364
1378
:param normalize_total: Normalize counts per cell prior TEI calculation.
1365
1379
:param log1p: Logarithmize the data matrix prior TEI calculation.
1366
1380
:param target_sum: After normalization, each observation (cell) has a total count equal to target_sum.
1381
+ :param chunk_size: Number of chunks.
1367
1382
:return: Relative expression profile DataFrame.
1368
1383
1369
1384
:type adata: AnnData
@@ -1380,6 +1395,7 @@ def get_rematrix(adata,
1380
1395
:type normalize_total: bool
1381
1396
:type log1p: bool
1382
1397
:type target_sum: float
1398
+ :type chunk_size: int
1383
1399
:rtype: pandas.DataFrame
1384
1400
1385
1401
Example
@@ -1695,7 +1711,8 @@ def mergeby_from_counts(adata,
1695
1711
max_expr = None ,
1696
1712
normalize_total = False ,
1697
1713
log1p = False ,
1698
- target_sum = 1e6 ):
1714
+ target_sum = 1e6 ,
1715
+ chunk_size = 100000 ):
1699
1716
"""
1700
1717
This function groups all counts of an existing AnnData object as an array based on variable or observation groups.
1701
1718
The resulting pandas.DataFrame can be used to e.g. apply statistics or visualize the groups more easily.
@@ -1712,6 +1729,7 @@ def mergeby_from_counts(adata,
1712
1729
:param normalize_total: Normalize counts per cell.
1713
1730
:param log1p: Logarithmize the data matrix.
1714
1731
:param target_sum: After normalization, each observation (cell) has a total count equal to target_sum.
1732
+ :param chunk_size: Number of chunks.
1715
1733
:return: List of three DataFrame. First DataFrame contains the grouped data (each cell contains a numpy.ndarray).
1716
1734
Second DataFrame contains original variable and observation assignment and groupings.
1717
1735
@@ -1727,6 +1745,7 @@ def mergeby_from_counts(adata,
1727
1745
:type normalize_total: bool
1728
1746
:type log1p: bool
1729
1747
:type target_sum: float
1748
+ :type chunk_size: int
1730
1749
:rtype: list
1731
1750
1732
1751
Example
@@ -1926,6 +1945,7 @@ def get_e50(adata,
1926
1945
:param normalize_total:
1927
1946
:param log1p:
1928
1947
:param target_sum:
1948
+ :param chunk_size: Number of chunks.
1929
1949
:param min_expr:
1930
1950
:param max_expr:
1931
1951
:return:
0 commit comments