
Commit

Bug in tests fixed
Old-Shatterhand committed Apr 25, 2024
1 parent d3aacc4 commit d699dc5
Showing 9 changed files with 47 additions and 31 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/test.yaml
@@ -9,14 +9,12 @@ on:
branches:
- main
- dev
- dev_1.0
- dev_1.0_weighting
- dev_1.1.0
pull_request:
branches:
- main
- dev
- dev_1.0
- dev_1.0_weighting
- dev_1.1.0
workflow_dispatch: # makes the workflow manually startable

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
3 changes: 2 additions & 1 deletion datasail/reader/read_genomes.py
@@ -43,5 +43,6 @@ def read_dir(ds: DataSet, path: Path) -> None:
read_data_input(data, dataset, read_dir)

dataset = read_data(weights, strats, sim, dist, inter, index, num_clusters, tool_args, dataset)
dataset = remove_duplicate_values(dataset, dataset.data)
if dataset.data is not None:
dataset = remove_duplicate_values(dataset, dataset.data)
return dataset
3 changes: 1 addition & 2 deletions datasail/reader/read_molecules.py
@@ -40,7 +40,6 @@ def read_molecule_data(
index: Optional[int] = None,
num_clusters: Optional[int] = None,
tool_args: str = "",
detect_duplicates: bool = True,
) -> DataSet:
"""
Read in molecular data and compute the weights and distances or similarities of every entity.
@@ -80,7 +79,7 @@ def read_dir(ds: DataSet, path: Path) -> None:
read_data_input(data, dataset, read_dir)

dataset = read_data(weights, strats, sim, dist, inter, index, num_clusters, tool_args, dataset)
if detect_duplicates:
if dataset.data is not None:
dataset = remove_molecule_duplicates(dataset)

return dataset
3 changes: 2 additions & 1 deletion datasail/reader/read_other.py
@@ -44,6 +44,7 @@ def read_dir(ds: DataSet, path: Path) -> None:
read_data_input(data, dataset, read_dir)

dataset = read_data(weights, strats, sim, dist, inter, index, num_clusters, tool_args, dataset)
dataset = remove_duplicate_values(dataset, dataset.data)
if dataset.data is not None:
dataset = remove_duplicate_values(dataset, dataset.data)

return dataset
3 changes: 2 additions & 1 deletion datasail/reader/read_proteins.py
@@ -46,7 +46,8 @@ def read_dir(ds: DataSet, path: Path) -> None:
dataset.format = FORM_PDB if str(next(iter(dataset.data.values()))).endswith(".pdb") else FORM_FASTA

dataset = read_data(weights, strats, sim, dist, inter, index, num_clusters, tool_args, dataset)
dataset = remove_duplicate_values(dataset, dataset.data)
if dataset.data is not None:
dataset = remove_duplicate_values(dataset, dataset.data)

return dataset

50 changes: 33 additions & 17 deletions datasail/reader/utils.py
@@ -14,7 +14,7 @@

DATA_INPUT = Optional[Union[str, Path, Dict[str, Union[str, np.ndarray]],
Callable[..., Dict[str, Union[str, np.ndarray]]], Generator[Tuple[str, Union[str, np.ndarray]], None, None]]]
MATRIX_INPUT = Optional[Union[str, Path, Tuple[List[str], np.ndarray], Callable[..., Tuple[List[str], np.ndarray]]]]
MATRIX_INPUT = Optional[Union[str, Path, np.ndarray, Tuple[np.ndarray, List[str]], Callable[..., Tuple[np.ndarray, Optional[List[str]]]]]]
DictMap = Dict[str, List[Dict[str, str]]]


@@ -63,7 +63,10 @@ def __hash__(self) -> int:
elif isinstance(obj, Namespace):
hv = hash(tuple(obj.__dict__.items()))
else:
hv = hash(obj)
try:
hv = hash(obj)
except TypeError:
hv = 0
hash_val ^= hv
return hash_val
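
The added try/except keeps hashing from failing when an attribute holds an unhashable value (for example the dict stored in DataSet.data); such attributes now contribute a neutral 0 to the XOR. A minimal standalone sketch of that fallback, using hypothetical example values:

def safe_hash(obj) -> int:
    # Mirror of the fallback above: unhashable values count as 0.
    try:
        return hash(obj)
    except TypeError:  # e.g. dict, list, numpy.ndarray
        return 0

print(safe_hash("SEQWENCE"))          # ordinary hashable value
print(safe_hash({"P1": "SEQWENCE"}))  # unhashable dict -> 0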

@@ -178,7 +181,7 @@ def count_inter(inter: List[Tuple[str, str]], mode: int) -> Generator[Tuple[str,
yield key, tmp[mode].count(key)


def read_clustering_file(filepath: Path, sep: str = "\t") -> Tuple[List[str], np.ndarray]:
def read_clustering_file(filepath: Path, sep: str = "\t") -> Tuple[np.ndarray, Optional[List[str]]]:
"""
Read a similarity or distance matrix from a file.
@@ -189,14 +192,16 @@ def read_clustering_file(filepath: Path, sep: str = "\t") -> Tuple[List[str], np
Returns:
The pairwise similarities/distances as a numpy array and, if given in the file, a list of the entity names
"""
names = []
measures = []
measures, names = [], []
with open(filepath, "r") as data:
for line in data.readlines()[1:]:
names_given = not data.readlines()[0].split(sep)[1].replace('.','',1).isdigit()
data.seek(0) # Reset filepointer to beginning
for line in data.readlines()[(1 if names_given else 0):]:
parts = line.strip().split(sep)
names.append(parts[0])
measures.append([float(x) for x in parts[1:]])
return names, np.array(measures)
if names_given:
names.append(parts[0])
measures.append([float(x) for x in parts[(1 if names_given else 0):]])
return np.array(measures), names
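
With this change, read_clustering_file auto-detects whether the matrix file carries entity names by testing whether the second field of the first line parses as a number. A short hedged sketch of the two layouts this is expected to accept; the file names and contents are hypothetical, not taken from the repository:

from pathlib import Path
from datasail.reader.utils import read_clustering_file

# Layout 1: header row plus a leading name column -> names are read from the file.
Path("with_names.tsv").write_text("ID\tA\tB\nA\t1.0\t0.3\nB\t0.3\t1.0\n")
matrix, names = read_clustering_file(Path("with_names.tsv"))  # names == ["A", "B"]

# Layout 2: plain numeric matrix -> the returned name list stays empty.
Path("no_names.tsv").write_text("1.0\t0.3\n0.3\t1.0\n")
matrix, names = read_clustering_file(Path("no_names.tsv"))    # names == []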


def read_csv(filepath: Path, sep: str = ",") -> Generator[Tuple[str, str], None, None]:
@@ -217,7 +222,7 @@ def read_csv(filepath: Path, sep: str = ",") -> Generator[Tuple[str, str], None,

def read_matrix_input(
in_data: MATRIX_INPUT,
) -> Tuple[List[str], Union[np.ndarray, str]]:
) -> Tuple[np.ndarray, Optional[List[str]]]:
"""
Read similarity or distance data from the different supported input types.
@@ -228,17 +233,20 @@ def read_matrix_input(
Tuple of a matrix holding the pairwise similarities/distances of the data samples and, if available, a list of their names
"""
names = None
if isinstance(in_data, str):
in_data = Path(in_data)
if isinstance(in_data, Path) and in_data.is_file():
names, similarity = read_clustering_file(in_data)
matrix, names = read_clustering_file(in_data)
elif isinstance(in_data, np.ndarray):
matrix = in_data
elif isinstance(in_data, tuple):
names, similarity = in_data
matrix, names = in_data
elif isinstance(in_data, Callable):
names, similarity = in_data()
matrix = in_data()
else:
raise ValueError()
return names, similarity
return matrix, names
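
Together with the widened MATRIX_INPUT alias above, read_matrix_input now accepts a file path, a raw numpy array, or a (matrix, names) tuple, and always returns the matrix first. A brief usage sketch under the same assumptions as the example above (the with_names.tsv file is hypothetical):

import numpy as np
from datasail.reader.utils import read_matrix_input

sim = np.array([[1.0, 0.3], [0.3, 1.0]])

matrix, names = read_matrix_input(sim)                # raw ndarray -> names stays None
matrix, names = read_matrix_input((sim, ["A", "B"]))  # explicit (matrix, names) tuple
matrix, names = read_matrix_input("with_names.tsv")   # path to a matrix file on disk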


def read_data(
@@ -295,18 +303,26 @@ def read_data(
# parse the protein similarity measure
if sim is None and dist is None:
dataset.similarity, dataset.distance = get_default(dataset.type, dataset.format)
dataset.names = list(dataset.data.keys())
elif sim is not None and not (isinstance(sim, str) and sim.lower() in SIM_ALGOS):
dataset.names, dataset.similarity = read_matrix_input(sim)
dataset.similarity, names = read_matrix_input(sim)
if names is not None:
dataset.names = names
elif dist is not None and not (isinstance(dist, str) and dist.lower() in DIST_ALGOS):
dataset.names, dataset.distance = read_matrix_input(dist)
dataset.distance, names = read_matrix_input(dist)
if names is not None:
dataset.names = names
else:
if sim is not None:
dataset.similarity = sim
else:
dataset.distance = dist
dataset.names = list(dataset.data.keys())

if dataset.data is not None and dataset.names is None:
dataset.names = list(dataset.data.keys())
elif dataset.similarity is None and dataset.distance is None:
raise ValueError("No data provided for splitting.")

dataset.args = validate_user_args(dataset.type, dataset.format, sim, dist, tool_args)

return dataset
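
The reordered logic above lets read_data handle datasets defined only by a precomputed matrix: names delivered with the matrix take precedence, the keys of the raw data are used as a fallback, and a ValueError is raised only when neither raw data nor a similarity/distance matrix is supplied. A compact standalone sketch of that decision order, using a hypothetical stand-in for the DataSet fields involved (not the commit's own code):

from typing import Dict, List, Optional

def resolve_names(data: Optional[Dict[str, str]],
                  matrix_names: Optional[List[str]],
                  has_matrix: bool) -> Optional[List[str]]:
    if matrix_names is not None:      # names shipped with the matrix win
        return matrix_names
    if data is not None:              # otherwise fall back to the raw-data keys
        return list(data.keys())
    if not has_matrix:                # nothing to split at all
        raise ValueError("No data provided for splitting.")
    return None

print(resolve_names({"A": "SEQ1", "B": "SEQ2"}, None, False))  # -> ['A', 'B']
print(resolve_names(None, ["A", "B"], True))                   # -> ['A', 'B']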
1 change: 0 additions & 1 deletion datasail/routine.py
@@ -96,7 +96,6 @@ def datasail_main(**kwargs) -> Tuple[Dict, Dict, Dict]:
f_dataset.id_map.get(f, ""), NOT_ASSIGNED)
else:
raise ValueError()

LOGGER.info("BQP splitting finished and results stored.")
LOGGER.info(f"Total runtime: {time.time() - start:.5f}s")

5 changes: 3 additions & 2 deletions tests/test_caching.py
@@ -22,8 +22,9 @@ def test_caching(size):
num_clusters=50,
)

original_dataset = cluster(dataset, num_clusters=50, output=base / "splits", threads=1, log_dir=Path("log.txt"),
linkage="average")
original_dataset = cluster(
dataset, num_clusters=50, output=base / "splits", threads=1, log_dir=Path("log.txt"), linkage="average"
)

# test caching
store_to_cache(dataset, **{"cache": True, "cache_dir": Path("test_cache")})
4 changes: 2 additions & 2 deletions tests/test_pipeline.py
@@ -381,10 +381,10 @@ def test_c2_impossible():
names=["train", "test"],
e_type="P",
e_data={str(i): "A" * (i + 1) for i in range(10)},
e_sim=([str(i) for i in range(10)], sims),
e_sim=sims,
e_clusters=2,
f_type="P",
f_sim=([str(i) for i in range(10)], sims),
f_sim=sims,
f_data={str(i): "A" * (i + 1) for i in range(10)},
f_clusters=2,
)
