diff --git a/eda.smk b/eda.smk index 1b0303b..fea354d 100644 --- a/eda.smk +++ b/eda.smk @@ -34,7 +34,7 @@ rule eda_msa: mv_problems() # list of (path to) msas - list_paths = list(Path(config["PATH_MSAS"]).rglob("*.fa")) + list_paths = list(Path(config["PATH_MSAS"]).rglob("*.[fa]*")) with ThreadPoolExecutor(max_workers=16) as pool: with tqdm(total=len(list_paths)) as progress: diff --git a/pangeblock.smk b/pangeblock.smk index 753f7c6..4055443 100644 --- a/pangeblock.smk +++ b/pangeblock.smk @@ -11,7 +11,7 @@ STATS_MSAS = pd.read_csv( index_col=False, sep="\t" ) NAMES = STATS_MSAS["path_msa"].apply(lambda path: Path(path).stem) # txt file with names of the MSAs - +NAMES = STATS_MSAS[["path_msa","n_seqs"]].query("n_seqs>1")["path_msa"].apply(lambda path: Path(path).stem) # --- rule all: @@ -27,7 +27,7 @@ rule all: rule compute_blocks: input: - expand("{path_msas}/{{name_msa}}.fa", path_msas=PATH_MSAS) + expand("{path_msas}/{{name_msa}}.fasta", path_msas=PATH_MSAS) output: expand("{path_output}/max_blocks/{{name_msa}}.json", path_output=PATH_OUTPUT) log: @@ -62,7 +62,7 @@ rule decompose_blocks: rule pangeblock: input: path_blocks=expand("{path_output}/block_decomposition/{{name_msa}}.json", path_output=PATH_OUTPUT), - path_msa=expand("{path_msas}/{{name_msa}}.fa", path_msas=PATH_MSAS) + path_msa=expand("{path_msas}/{{name_msa}}.fasta", path_msas=PATH_MSAS) output: path_gfa=expand("{path_output}/gfa/{{name_msa}}.gfa", path_output=PATH_OUTPUT), path_oc=expand("{path_output}/opt-coverage/{{name_msa}}.json", path_output=PATH_OUTPUT) @@ -92,7 +92,7 @@ rule bandage_labels: rule coverage: input: path_blocks=expand("{path_output}/block_decomposition/{{name_msa}}.json", path_output=PATH_OUTPUT), - path_msa=expand("{path_msa}/{{name_msa}}.fa", path_msa=PATH_MSAS) + path_msa=expand("{path_msa}/{{name_msa}}.fasta", path_msa=PATH_MSAS) output: path_gray=expand("{path_output}/coverage/{{name_msa}}-gray.jpg", path_output=PATH_OUTPUT), path_color=expand("{path_output}/coverage/{{name_msa}}-color.jpg", path_output=PATH_OUTPUT) diff --git a/params.yaml b/params.yaml index b2ebb6a..3cc4d54 100644 --- a/params.yaml +++ b/params.yaml @@ -1,7 +1,7 @@ -PATH_MSAS: "msa-difficile" -PATH_OUTPUT: "output-msa-difficile-weighted" +PATH_MSAS: "/data/pangeblocks-experiments/data/msas-dna" +PATH_OUTPUT: "/data/pangeblocks-experiments/output/gfa/pangeblocks-strings" OPTIMIZATION: - OBJECTIVE_FUNCTION: "weighted" # nodes, strings, weighted + OBJECTIVE_FUNCTION: "strings" # nodes, strings, weighted PENALIZATION: 3 # used only with "weighted" MIN_LEN: 2 # used only with "weighted" LOG_LEVEL: "INFO" \ No newline at end of file diff --git a/src/eda_msas.py b/src/eda_msas.py index 6941265..b23d2e7 100644 --- a/src/eda_msas.py +++ b/src/eda_msas.py @@ -1,42 +1,53 @@ -import sys -import time - from pathlib import Path -from Bio import AlignIO #load MSA -from utils.monitor_values import MonitorValues -from rich.progress import track - -def run(path_msas: str): - - out_file = Path("out/stats_msas.tsv") - out_file.parent.mkdir(exist_ok=True, parents=True) - problematic_out_file = Path("out/problematic_files.tsv") +from concurrent.futures import ThreadPoolExecutor +from src.utils import MonitorValuesPlus +from src.msa import AnalyzerMSA +from tqdm import tqdm +import argparse - # log values - mv_msas = MonitorValues(["path_msa","n_seqs","n_unique_seqs","n_cols"]) - mv_msas_problems = MonitorValues(["path_msa"]) +def main(path_msas: str, path_output: str): - list_msa=list(Path(path_msas).rglob("*fa")) - for path_msa in track(list_msa, description="Working on MSAs"): + mv = MonitorValuesPlus(list_vars=["path_msa","n_seqs","n_unique_seqs","n_identical_cols","n_cols", "perc_identical_cols"], + out_file=f"{path_output}/analysis-msa/stats_msas.tsv", + overwrite=True) + mv_problems = MonitorValuesPlus(list_vars=["path_msa"], + out_file=f"{path_output}/analysis-msa/problematic_files.tsv", + overwrite=True) + + analyzer = AnalyzerMSA() + def run(path_msa): + "Function to run with ThreadPoolExecutor" try: - # load MSA, count seqs and columns - align=AlignIO.read(path_msa, "fasta") - n_cols = align.get_alignment_length() - n_seqs = len(align) - n_unique_seqs = len(set([str(record.seq) for record in align])) + n_cols, n_seqs, n_unique_seqs, n_identical_cols = analyzer(path_msa) + perc_identical_cols = round(100*n_identical_cols/n_cols, 2) + mv() + except: + mv_problems() - mv_msas() - except: - mv_msas_problems() + # list of (path to) msas + list_paths = list(Path(path_msas).rglob("*.[fa]*")) + + with ThreadPoolExecutor(max_workers=16) as pool: + with tqdm(total=len(list_paths)) as progress: + progress.set_description("Analyzing MSA") + futures=[] + for path_msa in list_paths: + + future = pool.submit(run, path_msa) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + for future in futures: + future.result() - df = mv_msas.get_values_asdf() - df.sort_values(by=["n_cols","n_seqs"], inplace=True) - df.to_csv(out_file,sep="\t") - mv_msas_problems.get_values_asdf().to_csv(problematic_out_file, sep="\t") if __name__ == "__main__": - # TODO: use argparse - path_msas = sys.argv[-1] - run(path_msas) \ No newline at end of file + # Command line options + parser = argparse.ArgumentParser() + parser.add_argument("path_msas", help="directory to MSAs") + parser.add_argument("path_output", help="output directory") + args = parser.parse_args() + + main(args.path_msas, args.path_output) \ No newline at end of file diff --git a/src/ilp/optimization.py b/src/ilp/optimization.py index b1b554d..b19d7d6 100644 --- a/src/ilp/optimization.py +++ b/src/ilp/optimization.py @@ -142,6 +142,9 @@ def __call__(self, return_times: bool = False): c_variables = list(disjoint_vertical) + [item["idx"] for item in vertical_blocks.values()] + list(range(first_private_block, len(all_blocks))) + for idx, cvar in enumerate(c_variables): + logging.debug(f"c variables: idx {idx}, value {cvar}") + # msa_positions is a list of all positions (r,c) that are required to be # covered. We exclude the positions covered by vertical blocks, since # they will be guaranteed to be covered, as an effect of the fact that @@ -205,12 +208,12 @@ def __call__(self, return_times: bool = False): tf = time.time() times["constraints1-2-3"] = round(tf - ti, 3) - # constraint 4: vertical blocks are part of the solution - for idx in vertical_blocks: - model.addConstr(C[idx] == 1) - logging.info( - f"constraint4: vertical block ({idx}) - {self.input_blocks[idx].str()}" - ) + # # constraint 4: vertical blocks are part of the solution + # for idx in vertical_blocks: + # model.addConstr(C[idx] == 1) + # logging.info( + # f"constraint4: vertical block ({idx}) - {self.input_blocks[idx].str()}" + # ) ti = time.time() # TODO: include input to decide which objective function to use