diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..37d82163233a011594ec21bd64f7e3e5e26e7c89
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "configs/snakemake_slurm_profile"]
+	path = configs/snakemake_slurm_profile
+	url = git@gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/sciops/external-projects/snakemake_slurm_profile.git
+[submodule "src/utility_cgds"]
+	path = src/utility_cgds
+	url = git@gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/utility-images.git
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..31ffe33283fe77ebae857e2f848d50a69869817d
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,22 @@
+# Contributing
+
+## How to contribute?
+
+Follow the [CGDS' best practices for code
+development](http://cgds.uab.edu/manual-of-operations/standards-definitions/#standard-practices) (see sections IDE,
+Source Control, Peer Reviews). The [Repo owner](./README.md#repo-owners) is responsible for reviewing, keeping standards,
+and accepting the merge request into `origin/master`. Below is a brief outline of this process:
+
+- Create a new Git branch based off the relevant branch (`origin/master`).
+- Make your changes/additions within the new branch.
+- Push the branch to GitLab and submit the completed changes/additions via a GitLab merge request. In the merge request
+  form, choose `MERGE REQUEST TEMPLATE` in the drop-down for `Title` and then complete the merge request form in the
+  `Description` section with appropriate details.
+- [Repo owner](./README.md#repo-owners) will review the request and provide feedback, if any.
+- When deemed complete, the repo owner will merge the branch into `master`.
+
+
+## Cloning the repository/pipeline
+
+See [README.md](./README.md#installation). If you are cloning a branch other than `master`, use the `-b` option to
+specify the branch.
diff --git a/Changelog.md b/Changelog.md
new file mode 100644
index 0000000000000000000000000000000000000000..72d661d3becdfbb8f0cba6341b2c5602a5cf9925
--- /dev/null
+++ b/Changelog.md
@@ -0,0 +1,19 @@
+# CHANGELOG
+
+Changelog format to use:
+
+```
+YYYY-MM-DD  John Doe
+
+    * Big Change 1:
+      <reasoning here>
+    * Another Change 2:
+      <reasoning here>
+```
+---
+
+2021-03-16  Manavalan Gajapathy
+
+    * Creates QuaC workflow, which can run somalier, mosdepth, indexcov, covviz and verifybamid2
+    * Uses pedigree file as input
+    * Makes it production ready.
diff --git a/README.md b/README.md
index 18ccb3ce5329c62d7b781a79506620d44bc53bb8..927e54ef76dd58b335e4faa77feaa64a139d700a 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,203 @@
+- [QuaC](#quac)
+  - [Who am I?](#who-am-i)
+  - [Installation](#installation)
+  - [Environment Setup](#environment-setup)
+    - [Requirements](#requirements)
+    - [Setup config file](#setup-config-file)
+      - [Prepare verifybamid datasets for exome analysis](#prepare-verifybamid-datasets-for-exome-analysis)
+    - [Create conda environment](#create-conda-environment)
+  - [How to run QuaC](#how-to-run-quac)
+    - [Example usage](#example-usage)
+  - [Output](#output)
+    - [Dummy pedigree file creator](#dummy-pedigree-file-creator)
+  - [Contributing](#contributing)
+  - [Changelog](#changelog)
+
 # QuaC
 
 🦆🦆 Don't duck that QC thingy 🦆🦆
 
-## What can I quac about?
+## Who am I?
+
+QuaC is a Snakemake-based pipeline that runs a set of selected QC tools on NGS samples.
+Here are the tools it can currently quack on:
+
+| Tool                                                              | Use                                                        |
+| ----------------------------------------------------------------- | ---------------------------------------------------------- |
+| [somalier](https://github.com/brentp/somalier)                     | Estimation of sex, ancestry and relatedness                 |
+| [verifybamid](https://github.com/Griffan/VerifyBamID)              | Estimates within-species (i.e. cross-sample) contamination  |
+| [mosdepth](https://github.com/brentp/mosdepth)                     | Fast BAM/CRAM depth calculation                             |
+| [indexcov](https://github.com/brentp/goleft/tree/master/indexcov)  | Estimate coverage from whole-genome bam or cram index       |
+| [covviz](https://github.com/brwnj/covviz)                          | Identifies large, coverage-based anomalies                  |
+
+
+**Note:** At CGDS, a significant amount of QC is additionally performed as part of the [small variant caller
+pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/small_variant_caller_pipeline/).
+This QuaC pipeline should be treated as a companion to the QC results generated by the small_variant_caller_pipeline.
+
+
+## Installation
+
+Installation simply requires fetching the source code. The following are required:
+
+- Git
+- CGDS GitLab access
+- [SSH Key for access](https://docs.uabgrid.uab.edu/wiki/Cheaha_GettingStarted#Logging_in_to_Cheaha) to the Cheaha cluster
+
+To fetch the source code, change into a directory of your choice and run:
+
+```sh
+git clone -b master \
+    --recurse-submodules \
+    git@gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/sciops/pipelines/quac.git
+```
+
+Note that this repository uses git submodules, which are pulled automatically when cloning with the above command.
+Simply downloading this repository from GitLab, instead of cloning it, will not fetch the included submodules.
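+
+If you cloned without `--recurse-submodules` (or downloaded an archive), the submodules can still be fetched
+afterwards from within the repo:
+
+```sh
+git submodule update --init --recursive
+```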
+
+## Environment Setup
+
+### Requirements
+
+- Anaconda/miniconda
+
+The tools listed below, which are not available via conda, also need to be installed. Static binaries are available
+for both of these tools, so they are easy to install.
+
+- [somalier](https://github.com/brentp/somalier)
+- [goleft](https://github.com/brentp/goleft)
+
+*Note:* CGDS folks using QuaC on Cheaha may skip this step, as these tools are already installed and centrally
+available.
+
+### Setup config file
+
+Workflow config file `configs/workflow.yaml` provides paths to the necessary QC tools as well as to other files that
+some QC tools require. Modify them as necessary. Refer to each QC tool's documentation for more information on the
+files it requires.
+
+#### Prepare verifybamid datasets for exome analysis
+
+*This step is necessary only for exome samples.* verifybamid provides auxiliary resource files, which are necessary
+for analysis. However, the chromosome contigs in its exome resource files lack the `chr` prefix, which is expected
+for our analysis. Follow these steps to set up resource files with the `chr` prefix in their contig names.
+
+```sh
+# cd into exome resources dir
+cd <path-to>/VerifyBamID-2.0.1/resource/exome/
+sed -e 's/^/chr/' 1000g.phase3.10k.b38.exome.vcf.gz.dat.bed > 1000g.phase3.10k.b38_chr.exome.vcf.gz.dat.bed
+sed -e 's/^/chr/' 1000g.phase3.10k.b38.exome.vcf.gz.dat.mu > 1000g.phase3.10k.b38_chr.exome.vcf.gz.dat.mu
+cp 1000g.phase3.10k.b38.exome.vcf.gz.dat.UD 1000g.phase3.10k.b38_chr.exome.vcf.gz.dat.UD
+cp 1000g.phase3.10k.b38.exome.vcf.gz.dat.V 1000g.phase3.10k.b38_chr.exome.vcf.gz.dat.V
+```
+
+### Create conda environment
+
+```sh
+module reset
+module load Anaconda3/2020.02
+
+# create conda environment. Needed only the first time.
+conda env create --file configs/env/quac.yaml
+# activate conda environment
+conda activate quac
+
+# if you need to update the existing environment
+conda env update --file configs/env/quac.yaml
+```
+
+## How to run QuaC
+
+In the activated conda environment, QuaC is run using the script `src/run_quac.py`. Here are all the available options:
+
+```sh
+$ ./src/run_quac.py -h
+usage: run_quac.py [-h] [--project_name] [--projects_path] [--pedigree]
+                   [--outdir] [-m] [--exome] [--cluster_config] [--log_dir]
+                   [-e] [-n] [-l] [--rerun_failed]
+
+Wrapper tool for QuaC pipeline.
+
+optional arguments:
+  -h, --help         show this help message and exit
+
+QuaC workflow options:
+  --project_name     Project name (default: None)
+  --projects_path    Path where all projects are hosted. Don't include
+                     project name here. (default:
+                     /data/project/worthey_lab/projects/)
+  --pedigree         Pedigree filepath. Must correspond to the project
+                     supplied via --project_name (default: None)
+  --outdir           Out directory path (default:
+                     $USER_SCRATCH/tmp/quac/results)
+  -m , --modules     Runs only these user-specified module(s). If >1, use
+                     comma as delimiter. See QuaC snakemake script for
+                     available options. Useful for development. (default:
+                     all)
+  --exome            Flag to run in exome mode (default: False)
+
+QuaC wrapper options:
+  --cluster_config   Cluster config json file. Needed for snakemake to run
+                     jobs in cluster. (default: /data/project/worthey_lab/pro
+                     jects/experimental_pipelines/mana/quac/configs/cluster_c
+                     onfig.json)
+  --log_dir          Directory path where logs (both workflow's and
+                     wrapper's) will be stored (default:
+                     $USER_SCRATCH/tmp/quac/results/../logs)
+  -e , --extra_args  Pass additional custom args to snakemake. Equal symbol
+                     is needed for assignment as in this example: -e='--
+                     forceall' (default: None)
+  -n, --dryrun       Flag to dry-run snakemake. Does not execute anything,
+                     and just displays what would be done. Equivalent to '--
+                     extra_args "-n"' (default: False)
+  -l, --run_locally  Flag to run snakemake locally and not as a Slurm job.
+                     Useful for testing purposes. (default: False)
+  --rerun_failed     Number of times snakemake restarts failed jobs. This
+                     may be set to >0 to avoid the pipeline failing due to
+                     transient SLURM issues (default: 0)
+```
+
+
+### Example usage
+
+`project_name` and `pedigree` are required options to run the tool.
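+
+QuaC expects the standard six-column pedigree format (the same columns that `src/create_dummy_ped.py` writes). A
+minimal example, with hypothetical sample IDs and `-9` marking unknown fields:
+
+```
+#family_id	sample_id	paternal_id	maternal_id	sex	phenotype
+fam1	LW0000001	-9	-9	1	-9
+fam1	LW0000002	-9	-9	2	-9
+```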
+
+```sh
+# for a WGS project
+python src/run_quac.py --project_name CF_CFF_PFarrell --pedigree data/raw/ped/CF_CFF_PFarrell.ped
+
+# for an exome project
+python src/run_quac.py --project_name HCC --pedigree data/raw/ped/HCC.ped --exome
+
+# run for an exome project which is not in the default CGDS projects_path
+python src/run_quac.py --project_name UnusualCancers_CMGalluzi \
+    --projects_path /data/project/sloss/cgds_path_cmgalluzzi/ \
+    --pedigree data/raw/ped/UnusualCancers_CMGalluzi.ped \
+    --exome
+
+# for a WGS project, run specific modules/tools and write results to a dir of choice
+python src/run_quac.py --project_name CF_CFF_PFarrell --pedigree data/raw/ped/CF_CFF_PFarrell.ped \
+    --modules somalier --outdir /some/lake/with/plenty/ducks/
+```
+
+## Output
+
+QuaC results are stored at the path specified via the option `--outdir` (default: `$USER_SCRATCH/tmp/quac/results`).
+This includes aggregated QC results produced by [multiqc](https://multiqc.info/).
+
+### Dummy pedigree file creator
+
+Script `src/create_dummy_ped.py` creates a "dummy" pedigree file given a project path as input. Its purpose is just to
+create a basic pedigree file, which will lack sex (unless a project tracking sheet is provided), relatedness, and
+affected-status info. See the header of the script for usage instructions.
+
+Note that we plan to use Phenotips in the future to produce fully featured pedigree files. One may create them manually
+as well, but this could be error-prone.
 
-* Somalier
-  * Relatedness
-  * Sex
-  * Ancestry
-* indexcov
-  * (Estimated) coverage of smaples
+## Contributing
 
+If you would like to make changes to the source code, please see the [contribution guidelines](./CONTRIBUTING.md).
 
-## How to quac?
+## Changelog
 
-* Modify config file `configs/workflow.yaml` as needed. Note: `projects_path` and `project_name` may be the most
-  important ones you would care about.
-* Pedigree file specific to the project is required. Should be stored as `data/raw/ped/<project_name>.ped`.
-* See the header of `workflow/Snakefile` for usage instructions on how to run the workflow
+See [here](./Changelog.md).
diff --git a/configs/cluster_config.json b/configs/cluster_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..faf321d2235f14583fd2ca8c7fe41ed77401e7e0
--- /dev/null
+++ b/configs/cluster_config.json
@@ -0,0 +1,16 @@
+{
+    "__default__": {
+        "ntasks": 1,
+        "partition": "express",
+        "cpus-per-task": "{threads}",
+        "mem": "8G",
+        "output": "{RULE_LOGS_PATH}/{rule}-%j.log"
+    },
+    "mosdepth": {
+        "partition": "short",
+        "cpus-per-task": 12
+    },
+    "somalier": {
+        "cpus-per-task": 8
+    }
+}
diff --git a/configs/env/multiqc.yaml b/configs/env/multiqc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..321e793c25a3834a49fba3531e957aecd046543d
--- /dev/null
+++ b/configs/env/multiqc.yaml
@@ -0,0 +1,7 @@
+channels:
+  - conda-forge
+  - anaconda
+  - bioconda
+dependencies:
+  - python=3.6
+  - multiqc=1.9
diff --git a/configs/env/quac.yaml b/configs/env/quac.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a5c0c9b73104f5d6d82b8248c129ce8f8106d20a
--- /dev/null
+++ b/configs/env/quac.yaml
@@ -0,0 +1,15 @@
+name: quac
+
+channels:
+  - conda-forge
+  - bioconda
+
+dependencies:
+  - python==3.6.13
+  - black==20.8b1
+  - pylint==2.7.2
+  - bioconda::snakefmt==0.4.0
+  - bioconda::snakemake==6.0.5
+  - pip
+  - pip:
+      - slurmpy==0.0.8
diff --git a/configs/snakemake_slurm_profile b/configs/snakemake_slurm_profile
new file mode 160000
index 0000000000000000000000000000000000000000..4ecaf55d398ebfdf8415dff50c26beea0237c34d
--- /dev/null
+++ b/configs/snakemake_slurm_profile
@@ -0,0 +1 @@
+Subproject commit 4ecaf55d398ebfdf8415dff50c26beea0237c34d
diff --git a/configs/workflow.yaml b/configs/workflow.yaml
index a93400ef320619734300568af541d371dc452d99..1e1b83f640043494263026734fcc593706e2b1de 100644
--- a/configs/workflow.yaml
+++ b/configs/workflow.yaml
@@ -1,6 +1,3 @@
-projects_path: "/data/project/worthey_lab/projects/"
-# project_names: "MuscDyst_SU_MAlexander,CF_CFF_PFarrell,CF_TLOAF_PFarrell,EDS3_unkn_DGreenspan,UDN_Phase1_EAWorthey"
-project_names: "CF_CFF_PFarrell"
 ref: "/data/project/worthey_lab/datasets_central/human_reference_genome/processed/GRCh38/no_alt_rel20190408/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
 somalier:
   tool: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/somalier"
@@ -10,5 +7,5 @@ somalier:
 goleft:
   tool: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/goleft/0.2.4/goleft"
 verifyBamID:
-  svd_dat: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/verifyBamID/VerifyBamID-2.0.1/resource/1000g.phase3.100k.b38.vcf.gz.dat"
-logs_path: "logs/rule_logs/"
+  svd_dat_wgs: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/verifyBamID/VerifyBamID-2.0.1/resource/1000g.phase3.100k.b38.vcf.gz.dat"
+  svd_dat_exome: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/verifyBamID/VerifyBamID-2.0.1/resource/exome/1000g.phase3.10k.b38_chr.exome.vcf.gz.dat"
diff --git a/src/create_dummy_ped.py b/src/create_dummy_ped.py
index 27981777b30bd24b9adfafdcc3cdba267790c54e..8c2883b5709d86fc60a040becb5984e4b82a5996 100644
--- a/src/create_dummy_ped.py
+++ b/src/create_dummy_ped.py
@@ -2,17 +2,24 @@
 Create dummy ped file by project
 
 Usage:
+# setup environment
 ml reset
 ml Anaconda3
 conda activate quac_common
-python src/create_dummy_ped.py
+
+# Example
+python src/create_dummy_ped.py --project_path "/data/project/worthey_lab/projects/CF_CFF_PFarrell/" --outfile test.ped
 """
 
 from pathlib import Path
 
 import pandas as pd
+import fire
 
 
 def read_project_tracker(project_tracker_f):
+    """
+    Reads project tracking excel file. Expects certain columns to be present.
+    """
     df = pd.read_excel(project_tracker_f, usecols=["CGDS ID", "Sex"])
@@ -24,71 +31,51 @@
     return sample_sex_dict
 
 
-def nbbbb():
+def main(project_path, outfile, tracking_sheet=False):
+    """
+    Creates dummy pedigree file for the project requested
+
+    Args:
+        project_path (str): Project path. Script will look for samples under its subdirectory "analysis".
+        outfile (str): Output pedigree file path
+        tracking_sheet (str, optional): Project tracking sheet in excel format. Uses this for sex info. Defaults to False.
+    """
 
-    project_path = Path("/data/project/worthey_lab/projects") / project_name / "analysis"
+    # get sample's sex info from project tracking sheet, if supplied
+    if tracking_sheet:
+        sample_sex_dict = read_project_tracker(tracking_sheet)
+
+    # get samples from cheaha for the project
+    project_path = Path(project_path) / "analysis"
     samples = (
         f.name for f in project_path.iterdir() if f.is_dir() and f.name.startswith(("LW", "UDN"))
     )
 
     header = ["#family_id", "sample_id", "paternal_id", "maternal_id", "sex", "phenotype"]
-    with open(Path(outpath) / f"{project_name}.ped", "w") as out_handle:
+    with open(outfile, "w") as out_handle:
         out_handle.write("\t".join(header) + "\n")
 
         for sample in sorted(samples):
-            data = ["unknown", sample, "-9", "-9", "-9", "-9"]
+            data = [
+                "unknown",
+                sample,
+                "-9",  # father
+                "-9",  # mother
+                sample_sex_dict[sample] if tracking_sheet else "-9",  # sample sex
+                "-9",  # affected
+            ]
             out_handle.write("\t".join(data) + "\n")
 
     return None
 
 
-def main(outpath):
-
-    project_dict = {
-        "CF_CFF_PFarrell": {
-            "tracking_sheet": "data/external/project_tracker/PROJECT TRACKING -CF.xlsx",
-            "affected": "all",
-        },
-        "CF_TLOAF_PFarrell": {
-            "tracking_sheet": "data/external/project_tracker/PROJECT TRACKING -CF.xlsx",
-            "affected": "all",
-        },
-        # "CF_TLOAF_PFarrell",
-        # "EDS3_unkn_DGreenspan",
-        # "MuscDyst_SU_MAlexander",
-        # "UDN_Phase1_EAWorthey",
-    }
-
-    for project_name in project_dict:
-        # get sample's sex info from project tracking sheet
-        sample_sex_dict = read_project_tracker(project_dict[project_name]["tracking_sheet"])
-
-        # get samples from cheaha for the project
-        project_path = Path("/data/project/worthey_lab/projects") / project_name / "analysis"
-        samples = (
-            f.name
-            for f in project_path.iterdir()
-            if f.is_dir() and f.name.startswith(("LW", "UDN"))
-        )
-
-        header = ["#family_id", "sample_id", "paternal_id", "maternal_id", "sex", "phenotype"]
-        with open(Path(outpath) / f"{project_name}.ped", "w") as out_handle:
-            out_handle.write("\t".join(header) + "\n")
-
-            for sample in sorted(samples):
-                data = [
-                    "unknown",
-                    sample,
-                    "-9",  # father
-                    "-9",  # mother
-                    sample_sex_dict[sample],  # sample sex
-                    "1" if project_dict[project_name]["affected"] == "all" else "-9",  # affected
-                ]
-                out_handle.write("\t".join(data) + "\n")
-
-    return None
-
-
 if __name__ == "__main__":
-    OUT_PATH = "data/raw/ped"  # not so raw, is it?
-    main(OUT_PATH)
+    FIRE_MODE = True
+    # FIRE_MODE = False
+
+    if FIRE_MODE:
+        fire.Fire(main)
+    else:
+        PROJECT_PATH = "/data/project/worthey_lab/projects/CF_CFF_PFarrell/"
+        OUTFILE = "out.ped"
+        main(PROJECT_PATH, OUTFILE)
diff --git a/src/run_pipeline.sh b/src/run_pipeline.sh
deleted file mode 100755
index 8addf3ed16f7621a1af4fad4d78d82a4a791bf88..0000000000000000000000000000000000000000
--- a/src/run_pipeline.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env bash
-#SBATCH --job-name=quac
-#SBATCH --output=logs/quac-%j.log
-#SBATCH --cpus-per-task=20
-#SBATCH --mem-per-cpu=8G
-#SBATCH --partition=short
-
-set -euo pipefail
-
-module reset
-module load Anaconda3/2020.02
-module load snakemake/5.9.1-foss-2018b-Python-3.6.6
-
-snakemake -s workflow/Snakefile --use-conda -k -p
diff --git a/src/run_quac.py b/src/run_quac.py
new file mode 100755
index 0000000000000000000000000000000000000000..a55da080e4b09132c571c269fc403a014909f2c3
--- /dev/null
+++ b/src/run_quac.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+
+"""
+QuaC pipeline.
+
+Reads user input data, constructs the snakemake command to run the pipeline
+along with its required modules, and submits it as a slurm job.
+
+Run the script with --help flag to see its available options.
+
+Example usage:
+# First create environment as described in documentation
+python src/run_quac.py --project_name CF_CFF_PFarrell --pedigree data/raw/ped/CF_CFF_PFarrell.ped
+python src/run_quac.py --project_name HCC --pedigree data/raw/ped/HCC.ped --exome
+python src/run_quac.py --project_name UnusualCancers_CMGalluzi --projects_path /data/project/sloss/cgds_path_cmgalluzzi/ \
+    --pedigree data/raw/ped/UnusualCancers_CMGalluzi.ped --exome
+"""
+
+import argparse
+from pathlib import Path
+import os.path
+import yaml
+from utility_cgds.cgds.pipeline.src.submit_slurm_job import submit_slurm_job
+
+
+def create_snakemake_command(args):
+    """
+    Construct the snakemake command to run the pipeline
+    """
+
+    # slurm profile dir for snakemake to properly handle cluster job failures
+    repo_path = Path(__file__).absolute().parents[1]
+    snakemake_profile_dir = (
+        repo_path / "configs/snakemake_slurm_profile/{{cookiecutter.profile_name}}/"
+    )
+
+    # use absolute path to run it from anywhere
+    snakefile_path = repo_path / "workflow" / "Snakefile"
+
+    quac_configs = {
+        "modules": args.modules,
+        "project_name": args.project_name,
+        "projects_path": args.projects_path,
+        "ped": args.pedigree,
+        "out_dir": str(Path(args.outdir) / args.project_name),
+        "log_dir": args.log_dir,
+        "exome": args.exome,
+    }
+    quac_configs = " ".join([f"{k}='{v}'" for k, v in quac_configs.items()])
+
+    # snakemake command to run
+    cmd = [
+        "snakemake",
+        f"--snakefile '{snakefile_path}'",
+        f"--config {quac_configs}",
+        f"--restart-times {args.rerun_failed}",
+        "--use-conda",
+        f"--profile '{snakemake_profile_dir}'",
+        f"--cluster-config '{args.cluster_config}'",
+        "--cluster 'sbatch --ntasks {cluster.ntasks} --partition {cluster.partition}"
+        " --cpus-per-task {cluster.cpus-per-task} --mem {cluster.mem}"
+        " --output {cluster.output} --parsable'",
+    ]
+
+    # add any user provided extra args for snakemake
+    if args.extra_args:
+        cmd += [args.extra_args]
+
+    # adds option for dryrun if requested
+    if args.dryrun:
+        cmd += ["--dryrun"]
+
+    return cmd
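+
+
+# For illustration only -- with the defaults, the assembled command looks roughly like
+# the following (paths abbreviated; actual values depend on the args supplied):
+#
+#   snakemake --snakefile '<repo>/workflow/Snakefile' \
+#       --config modules='all' project_name='<project>' ... exome='False' \
+#       --restart-times 0 --use-conda --profile '<profile_dir>' \
+#       --cluster-config '<repo>/configs/cluster_config.json' \
+#       --cluster 'sbatch --ntasks {cluster.ntasks} ... --parsable'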
+
+
+def main(args):
+
+    # get snakemake command to execute for the pipeline
+    snakemake_cmd = create_snakemake_command(args)
+
+    # put together pipeline command to be run
+    pipeline_cmd = "\n".join(
+        [
+            " \\\n\t".join(snakemake_cmd),
+        ]
+    )
+
+    print(
+        f'{"#" * 40}\n'
+        "Command to run the pipeline:\n"
+        "\x1B[31;95m" + pipeline_cmd + "\x1B[0m\n"
+        f'{"#" * 40}\n'
+    )
+
+    # submit snakemake command as a slurm job
+    slurm_resources = {
+        "partition": "short",  # express(max 2 hrs), short(max 12 hrs), medium(max 50 hrs), long(max 150 hrs)
+        "ntasks": "1",
+        "time": "12:00:00",
+        "cpus-per-task": "1",
+        "mem": "8G",
+    }
+
+    job_dict = {
+        "basename": "quac-",
+        "log_dir": args.log_dir,
+        "run_locally": args.run_locally,
+        "resources": slurm_resources,
+    }
+
+    submit_slurm_job(pipeline_cmd, job_dict)
+
+    return None
+
+
+def is_valid_file(p, arg):
+    if not Path(arg).is_file():
+        p.error("The file '%s' does not exist!" % arg)
+    else:
+        return arg
+
+
+def is_valid_dir(p, arg):
+    if not Path(os.path.expandvars(arg)).is_dir():
+        p.error("The directory '%s' does not exist!" % arg)
+    else:
+        return os.path.expandvars(arg)
+
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description="Wrapper tool for QuaC pipeline.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    ############ Args for QuaC workflow ############
+    WORKFLOW = PARSER.add_argument_group("QuaC workflow options")
+
+    WORKFLOW.add_argument(
+        "--project_name",
+        help="Project name",
+        metavar="",
+    )
+
+    PROJECT_PATH_DEFAULT = "/data/project/worthey_lab/projects/"
+    WORKFLOW.add_argument(
+        "--projects_path",
+        help="Path where all projects are hosted. Don't include project name here.",
+        default=PROJECT_PATH_DEFAULT,
+        type=lambda x: is_valid_dir(PARSER, x),
+        metavar="",
+    )
+    WORKFLOW.add_argument(
+        "--pedigree",
+        help="Pedigree filepath. Must correspond to the project supplied via --project_name",
+        type=lambda x: is_valid_file(PARSER, x),
+        metavar="",
+    )
+    QUAC_OUTDIR_DEFAULT = "$USER_SCRATCH/tmp/quac/results"
+    WORKFLOW.add_argument(
+        "--outdir",
+        help="Out directory path",
+        default=QUAC_OUTDIR_DEFAULT,
+        type=lambda x: is_valid_dir(PARSER, x),
+        metavar="",
+    )
+    WORKFLOW.add_argument(
+        "-m",
+        "--modules",
+        help="Runs only these user-specified module(s). If >1, use comma as delimiter. \
+            See QuaC snakemake script for available options. Useful for development.",
+        default="all",
+        metavar="",
+    )
+    WORKFLOW.add_argument(
+        "--exome",
+        action="store_true",
+        help="Flag to run in exome mode",
+    )
+
+    ############ Args for QuaC wrapper tool ############
+    WRAPPER = PARSER.add_argument_group("QuaC wrapper options")
+
+    CLUSTER_CONFIG_DEFAULT = Path(__file__).absolute().parents[1] / "configs/cluster_config.json"
+    WRAPPER.add_argument(
+        "--cluster_config",
+        help="Cluster config json file. Needed for snakemake to run jobs in cluster.",
+        default=CLUSTER_CONFIG_DEFAULT,
+        type=lambda x: is_valid_file(PARSER, x),
+        metavar="",
+    )
+
+    LOGS_DIR_DEFAULT = f"{QUAC_OUTDIR_DEFAULT}/../logs"
+    WRAPPER.add_argument(
+        "--log_dir",
+        help="Directory path where logs (both workflow's and wrapper's) will be stored",
+        default=LOGS_DIR_DEFAULT,
+        type=lambda x: is_valid_dir(PARSER, x),
+        metavar="",
+    )
+    WRAPPER.add_argument(
+        "-e",
+        "--extra_args",
+        help="Pass additional custom args to snakemake. Equal symbol is needed "
+        "for assignment as in this example: -e='--forceall'",
+        metavar="",
+    )
+    WRAPPER.add_argument(
+        "-n",
+        "--dryrun",
+        action="store_true",
+        help="Flag to dry-run snakemake. Does not execute anything, and "
+        "just displays what would be done. Equivalent to '--extra_args \"-n\"'",
+    )
+    WRAPPER.add_argument(
+        "-l",
+        "--run_locally",
+        action="store_true",
+        help="Flag to run snakemake locally and not as a Slurm job. "
+        "Useful for testing purposes.",
+    )
+    RERUN_FAILED_DEFAULT = 0
+    WRAPPER.add_argument(
+        "--rerun_failed",
+        help="Number of times snakemake restarts failed jobs. This may be set to >0 "
+        "to avoid the pipeline failing due to transient SLURM issues",
+        default=RERUN_FAILED_DEFAULT,
+        metavar="",
+    )
+
+    ARGS = PARSER.parse_args()
+
+    main(ARGS)
diff --git a/src/utility_cgds b/src/utility_cgds
new file mode 160000
index 0000000000000000000000000000000000000000..5145e05d34d694a676524d02d78ee21eae8ce64e
--- /dev/null
+++ b/src/utility_cgds
@@ -0,0 +1 @@
+Subproject commit 5145e05d34d694a676524d02d78ee21eae8ce64e
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 18d4272efc74a6c92576d202f1c80c7799da26f4..0d582651489c26e13648b944f67fd64a190bd55c 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,15 +1,13 @@
-'''
-USAGE:
-module reset
-module load Anaconda3/2020.02
-module load snakemake/5.9.1-foss-2018b-Python-3.6.6
-
-snakemake -s workflow/Snakefile --use-conda -k -p --config modules=all
-'''
+"""
+QuaC pipeline
+"""
+
 from pathlib import Path
 import pandas as pd
 
 WORKFLOW_PATH = Path(workflow.basedir).parent
+
+
 configfile: WORKFLOW_PATH / "configs/workflow.yaml"
@@ -18,13 +16,18 @@ include: "rules/common.smk"
 include: "rules/relatedness_ancestry.smk"
 include: "rules/coverage_analysis.smk"
 include: "rules/within_species_contamintation.smk"
+include: "rules/aggregate_results.smk"
+
+
 #########################################
 ############  CONSTRAINTS  ############
 wildcard_constraints:
-    sample = '|'.join([item for sublist in SAMPLES.values() for item in sublist]),
-    project = '|'.join(PROJECT_NAMES),
+    sample="|".join(SAMPLES),
+    project=PROJECT_NAME,
+
+
 #########################################
@@ -32,5 +35,5 @@ rule all:
     input:
         TARGETS_SOMALIER,
         TARGETS_COVERAGE,
-        TARGETS_CONTAMINATION
-
+        TARGETS_CONTAMINATION,
+        OUT_DIR / "multiqc/multiqc_report.html",
diff --git a/workflow/rules/aggregate_results.smk b/workflow/rules/aggregate_results.smk
new file mode 100644
index 0000000000000000000000000000000000000000..f952848c08331b99a684244b726e549c56131514
--- /dev/null
+++ b/workflow/rules/aggregate_results.smk
@@ -0,0 +1,23 @@
+rule multiqc:
+    input:
+        TARGETS_SOMALIER,
+        TARGETS_COVERAGE,
+        TARGETS_CONTAMINATION,
+    output:
+        OUT_DIR / "multiqc/multiqc_report.html",
+    message:
+        "Aggregates QC results using multiqc."
+    conda:
+        str(WORKFLOW_PATH / "configs/env/multiqc.yaml")
+    params:
+        out_dir=lambda wildcards, output: str(Path(output[0]).parent),
+        in_dirs=OUT_DIR,
+    shell:
+        r"""
+        unset PYTHONPATH
+        multiqc \
+            --cl_config "max_table_rows: 2000" \
+            --force \
+            -o "{params.out_dir}" \
+            {params.in_dirs}
+        """
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index b2dd404c965af190524a54db43a5a3a490c8c1fd..e809ff9e2eab1e497f4935f6383ab640a84f9569 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,73 +1,71 @@
 def get_samples(ped_fpath):
-
+    """
+    Parse pedigree file and return sample names
+    """
     samples = ()
-    with open(ped_fpath, 'r') as f_handle:
+    with open(ped_fpath, "r") as f_handle:
         for line in f_handle:
-            if line.startswith('#'):
+            if line.startswith("#"):
                 continue
-            sample = line.split('\t')[1]
+            sample = line.split("\t")[1]
             samples += (sample,)
 
     return samples
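+
+
+# e.g., a pedigree data line "fam1\tLW0000001\t-9\t-9\t-9\t-9" yields the sample
+# name "LW0000001" (hypothetical ID, shown for illustration only)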
+ """ + user_input = set([x.strip().lower() for x in user_input.strip().split(",")]) - selected_modules = set([x.strip().lower() for x in chosen.strip().split(',')]) - if selected_modules.difference(set(allowed_options)): - msg =f"ERROR: Unexpected module was supplied by user. Allowed options: {allowed_options}" + allowed_options = ["somalier", "verifybamid", "indexcov", "mosdepth", "covviz", "all"] + if user_input.difference(set(allowed_options)): + msg = f"ERROR: Unexpected module was supplied by user. Allowed options: {allowed_options}" raise SystemExit(msg) - return selected_modules - - -EXTERNAL_DIR = Path("data/external") -RAW_DIR = Path("data/raw") -INTERIM_DIR = Path("data/interim") -PROCESSED_DIR = Path("data/processed") - -LOGS_PATH = Path(config['logs_path']) -LOGS_PATH.mkdir(parents=True, exist_ok=True) + print(f"Tools chosen by user to run: {list(user_input)}") -PROJECTS_PATH = Path(config['projects_path']) -PROJECT_NAMES = config['project_names'].split(',') -SAMPLES = {project: get_samples(RAW_DIR / f"ped/{project}.ped") for project in PROJECT_NAMES} + return user_input -MODULES_TO_RUN = modules_to_run(config['modules']) +def get_targets(tool_name, samples=None): + """ + returns target files based on the tool + """ + flist = [] + if tool_name == "somalier": + flist += [ + OUT_DIR / "somalier" / "relatedness" / "somalier.html", + OUT_DIR / "somalier" / "ancestry" / "somalier.somalier-ancestry.html", + ] + elif tool_name == "indexcov": + flist += [OUT_DIR / "indexcov" / "index.html"] + elif tool_name == "covviz": + flist += [OUT_DIR / "covviz/" / "covviz_report.html"] + elif tool_name == "mosdepth": + flist += [OUT_DIR / "mosdepth" / f"mosdepth.html"] + flist += ( + expand( + str(OUT_DIR / "mosdepth" / "results" / "{sample}.mosdepth.global.dist.txt"), sample=samples + ), + ) + elif tool_name == "verifybamid": + flist += (expand(str(OUT_DIR / "verifyBamID" / "{sample}.Ancestry"), sample=samples),) -def get_targets(tool_name, projects=PROJECT_NAMES, sample_dict=SAMPLES): + return flist - flist = [] - for project in projects: - for sample in sample_dict[project]: - if tool_name == 'mosdepth': - f = INTERIM_DIR / "mosdepth" / project / f"{sample}.mosdepth.global.dist.txt" - flist.append(f) - elif tool_name == 'verifybamid': - f = PROCESSED_DIR / "verifyBamID" / project / f"{sample}.Ancestry" - flist.append(f) - if tool_name == 'somalier': - f = [ - expand(str(PROCESSED_DIR / "somalier/{project}/relatedness/somalier.html"), - project=PROJECT_NAMES), - expand(str(PROCESSED_DIR / "somalier/{project}/ancestry/somalier.somalier-ancestry.html"), - project=PROJECT_NAMES), - ] - flist.append(f) - elif tool_name == 'indexcov': - f = expand(str(PROCESSED_DIR / "indexcov/{project}/index.html"), - project=PROJECT_NAMES) - flist.append(f) - elif tool_name == 'covviz': - f = expand(str(PROCESSED_DIR / "covviz/{project}/covviz_report.html"), - project=PROJECT_NAMES), - flist.append(f) - elif tool_name == 'mosdepth': - f = expand(str(PROCESSED_DIR / "mosdepth/{project}/mosdepth_{project}.html"), - project=PROJECT_NAMES), - flist.append(f) +#### configs from cli #### +OUT_DIR = Path(config["out_dir"]) +PROJECT_NAME = config["project_name"] +PROJECTS_PATH = Path(config["projects_path"]) +MODULES_TO_RUN = modules_to_run(config["modules"]) +PEDIGREE_FPATH = config["ped"] +EXOME_MODE = config["exome"] +#### configs from configfile #### +RULE_LOGS_PATH = Path(config["log_dir"]) / "rule_logs" +RULE_LOGS_PATH.mkdir(parents=True, exist_ok=True) - return flist +SAMPLES = get_samples(PEDIGREE_FPATH) diff 
diff --git a/workflow/rules/coverage_analysis.smk b/workflow/rules/coverage_analysis.smk
index 81d2f2e025d8712d1d4b8580dd3dcef4c370e98c..386b1567ddd858c851955680162e0aa12602cc63 100644
--- a/workflow/rules/coverage_analysis.smk
+++ b/workflow/rules/coverage_analysis.smk
@@ -1,29 +1,30 @@
 TARGETS_COVERAGE = [
     # indexcov
-    get_targets('indexcov') if {'all', 'indexcov'}.intersection(MODULES_TO_RUN) else [],
+    get_targets("indexcov") if {"all", "indexcov"}.intersection(MODULES_TO_RUN) and not EXOME_MODE else [],
     # covviz
-    get_targets('covviz') if {'all', 'covviz'}.intersection(MODULES_TO_RUN) else [],
+    get_targets("covviz") if {"all", "covviz"}.intersection(MODULES_TO_RUN) and not EXOME_MODE else [],
     # mosdepth
-    get_targets('mosdepth') if {'all', 'mosdepth'}.intersection(MODULES_TO_RUN) else [],
+    get_targets("mosdepth", SAMPLES) if {"all", "mosdepth"}.intersection(MODULES_TO_RUN) else [],
 ]
 
 
 ##########################   Mosdepth   ##########################
 rule mosdepth_coverage:
     input:
-        bam = PROJECTS_PATH / "{project}" / "analysis" / "{sample}" / "bam" / "{sample}.bam",
-        bam_index = PROJECTS_PATH / "{project}" / "analysis" / "{sample}" / "bam" / "{sample}.bam.bai",
+        bam=PROJECTS_PATH / PROJECT_NAME / "analysis" / "{sample}" / "bam" / "{sample}.bam",
+        bam_index=PROJECTS_PATH / PROJECT_NAME / "analysis" / "{sample}" / "bam" / "{sample}.bam.bai",
     output:
-        dist = INTERIM_DIR / "mosdepth/{project}/{sample}.mosdepth.global.dist.txt",
-        summary = INTERIM_DIR / "mosdepth/{project}/{sample}.mosdepth.summary.txt",
-    log:
-        LOGS_PATH / "{project}/mosdepth_coverage-{sample}.log"
+        dist=OUT_DIR / "mosdepth" / "results" / "{sample}.mosdepth.global.dist.txt",
+        summary=OUT_DIR / "mosdepth" / "results" / "{sample}.mosdepth.summary.txt",
     message:
-        "Running mosdepth for coverage. Project: {wildcards.project}, sample: {wildcards.sample}"
+        "Running mosdepth for coverage. Sample: {wildcards.sample}"
+    group:
+        "mosdepth"
     conda:
         str(WORKFLOW_PATH / "configs/env/mosdepth.yaml")
+    threads: 4
     params:
-        out_prefix = lambda wildcards, output: output['summary'].replace('.mosdepth.summary.txt', ''),
+        out_prefix=lambda wildcards, output: output["summary"].replace(".mosdepth.summary.txt", ""),
     shell:
         r"""
         mosdepth \
@@ -31,55 +32,57 @@ rule mosdepth_coverage:
             --threads {threads} \
             --fast-mode \
             {params.out_prefix} \
-            {input.bam} \
-            > {log} 2>&1
+            {input.bam}
         """
 
 
 rule mosdepth_plot:
     input:
-        dist = lambda wildcards: expand(str(INTERIM_DIR / "mosdepth" / wildcards.project / "{sample}.mosdepth.global.dist.txt"),
-                                        sample=SAMPLES[wildcards.project]),
-        script = WORKFLOW_PATH / "src/mosdepth/v0.3.1/plot-dist.py",
+        dist=expand(
+            str(OUT_DIR / "mosdepth" / "results" / "{sample}.mosdepth.global.dist.txt"), sample=SAMPLES
+        ),
+        script=WORKFLOW_PATH / "src/mosdepth/v0.3.1/plot-dist.py",
     output:
-        PROCESSED_DIR / "mosdepth/{project}/mosdepth_{project}.html",
-    log:
-        LOGS_PATH / "{project}/mosdepth_plot.log"
+        OUT_DIR / "mosdepth" / "mosdepth.html",
     message:
-        "Running mosdepth plotting. Project: {wildcards.project}"
+        "Running mosdepth plotting"
+    group:
+        "mosdepth"
     params:
-        in_dir = lambda wildcards, input: Path(input[0]).parent,
-        workflow_dir = Path(workflow.basedir).parent
+        in_dir=lambda wildcards, input: Path(input[0]).parent,
     shell:
         r"""
         echo "Heads up: Mosdepth-plotting is run on all samples in "{params.in_dir}"; Not just the files mentioned in the rule's input."
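+        # note: {output} must be an absolute path here, since the next line cd's away
+        # from the working directory (OUT_DIR is absolute in practice)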
         cd {params.in_dir}  # if not in directory, mosdepth uses filepath as sample name :(
         python {input.script} \
-            --output {params.workflow_dir}/{output} \
-            *.mosdepth.global.dist.txt \
-            > {params.workflow_dir}/{log} 2>&1
+            --output {output} \
+            *.mosdepth.global.dist.txt
         """
 
 
 ##########################   indexcov   ##########################
 rule indexcov:
     input:
-        bam = lambda wildcards: expand(str(PROJECTS_PATH / wildcards.project / "analysis" / "{sample}" / "bam" / "{sample}.bam"),
-                                       sample=SAMPLES[wildcards.project]),
-        bam_index = lambda wildcards: expand(str(PROJECTS_PATH / wildcards.project / "analysis" / "{sample}" / "bam" / "{sample}.bam.bai"),
-                                             sample=SAMPLES[wildcards.project]),
-        goleft_tool = config['goleft']['tool'],
+        bam=expand(
+            str(PROJECTS_PATH / PROJECT_NAME / "analysis" / "{sample}" / "bam" / "{sample}.bam"),
+            sample=SAMPLES,
+        ),
+        bam_index=expand(
+            str(PROJECTS_PATH / PROJECT_NAME / "analysis" / "{sample}" / "bam" / "{sample}.bam.bai"),
+            sample=SAMPLES,
+        ),
+        goleft_tool=config["goleft"]["tool"],
     output:
-        PROCESSED_DIR / "indexcov/{project}/index.html",
-        PROCESSED_DIR / "indexcov/{project}/{project}-indexcov.bed.gz",
-    log:
-        LOGS_PATH / "{project}/indexcov.log"
+        html=OUT_DIR / "indexcov" / "index.html",
+        bed=OUT_DIR / "indexcov" / "indexcov-indexcov.bed.gz",
     message:
-        "Running indexcov. Project: {wildcards.project}"
+        "Running indexcov"
+    log:
+        OUT_DIR / "indexcov" / "stdout.log",
     params:
-        outdir = lambda wildcards, output: Path(output[0]).parent,
-        project_dir = lambda wildcards, input: str(Path(input['bam'][0]).parents[2]),
+        outdir=lambda wildcards, output: Path(output[0]).parent,
+        project_dir=lambda wildcards, input: str(Path(input["bam"][0]).parents[2]),
     shell:
         r"""
        echo "Heads up: Indexcov is run on all samples in the "project directory"; Not just the files mentioned in the rule's input."
@@ -94,21 +97,21 @@ rule indexcov:
 ##########################   covviz   ##########################
 rule covviz:
     input:
-        bed = PROCESSED_DIR / "indexcov/{project}/{project}-indexcov.bed.gz",
-        ped = RAW_DIR / "ped" / "{project}.ped",
+        bed=OUT_DIR / "indexcov" / "indexcov-indexcov.bed.gz",
+        ped=PEDIGREE_FPATH,
     output:
-        PROCESSED_DIR / "covviz/{project}/covviz_report.html",
-    log:
-        LOGS_PATH / "{project}/covviz.log"
+        html=OUT_DIR / "covviz" / "covviz_report.html",
     message:
-        "Running covviz. Project: {wildcards.project}"
Project: {wildcards.project}" + "Running covviz" + log: + OUT_DIR / "covviz" / "stdout.log", conda: str(WORKFLOW_PATH / "configs/env/covviz.yaml") shell: r""" covviz \ --ped {input.ped} \ - --output {output} \ + --output {output.html} \ {input.bed} \ > {log} 2>&1 """ diff --git a/workflow/rules/relatedness_ancestry.smk b/workflow/rules/relatedness_ancestry.smk index 8fe64365889f549ba1052ddfc20ad7e0093bb56f..ee2cb99985b6e1cbccbff4e4754b30291e69faed 100644 --- a/workflow/rules/relatedness_ancestry.smk +++ b/workflow/rules/relatedness_ancestry.smk @@ -1,49 +1,52 @@ TARGETS_SOMALIER = [ - get_targets('somalier') if {'all', 'somalier'}.intersection(MODULES_TO_RUN) else [], + get_targets("somalier") if {"all", "somalier"}.intersection(MODULES_TO_RUN) else [], ] + rule somalier_extract: input: - bam = PROJECTS_PATH / "{project}" / "analysis" / "{sample}" / "bam" / "{sample}.bam", - bam_index = PROJECTS_PATH / "{project}" / "analysis" / "{sample}" / "bam" / "{sample}.bam.bai", - somalier_tool = config['somalier']['tool'], - sites = config['somalier']['sites'], - ref_genome = config['ref'], + bam=PROJECTS_PATH / PROJECT_NAME / "analysis" / "{sample}" / "bam" / "{sample}.bam", + bam_index=PROJECTS_PATH / PROJECT_NAME / "analysis" / "{sample}" / "bam" / "{sample}.bam.bai", + somalier_tool=config["somalier"]["tool"], + sites=config["somalier"]["sites"], + ref_genome=config["ref"], output: - INTERIM_DIR / "somalier_extract/{project}/{sample}.somalier" - log: - LOGS_PATH / "{project}/somalier_extract-{sample}.log" + OUT_DIR / "somalier/extract/{sample}.somalier", message: - "Running somalier extract. Project: {wildcards.project}" + "Running somalier extract. Sample: {wildcards.sample}" + group: + "somalier" params: - outdir = lambda wildcards, output: Path(output[0]).parent, + outdir=lambda wildcards, output: Path(output[0]).parent, shell: r""" {input.somalier_tool} extract \ --sites {input.sites} \ --fasta {input.ref_genome} \ --out-dir {params.outdir} \ - {input.bam} \ - > {log} 2>&1 + {input.bam} """ rule somalier_relate: input: - extracted = lambda wildcards: expand(str(INTERIM_DIR / "somalier_extract" / wildcards.project / "{sample}.somalier"), - sample=SAMPLES[wildcards.project]), - ped = RAW_DIR / "ped" / "{project}.ped", - somalier_tool = config['somalier']['tool'], + extracted=expand(str(OUT_DIR / "somalier" / "extract" / "{sample}.somalier"), sample=SAMPLES), + ped=PEDIGREE_FPATH, + somalier_tool=config["somalier"]["tool"], output: - expand(str(PROCESSED_DIR / "somalier/{{project}}/relatedness/somalier.{ext}"), - ext=['html', 'groups.tsv', 'pairs.tsv', 'samples.tsv']), - log: - LOGS_PATH / "{project}/somalier_relate.log" + out=expand( + str(OUT_DIR / "somalier" / "relatedness" / "somalier.{ext}"), + ext=["html", "pairs.tsv", "samples.tsv"], + ), message: - "Running somalier relate. Project: {wildcards.project}" + "Running somalier relate" + log: + log=OUT_DIR / "somalier" / "relatedness" / "somalier.log", + group: + "somalier" params: - outdir = lambda wildcards, output: Path(output[0]).parent, - indir = lambda wildcards, input: Path(input[0]).parent, + outdir=lambda wildcards, output: Path(output["out"][0]).parent, + indir=lambda wildcards, input: Path(input[0]).parent, shell: r""" echo "Heads up: Somalier is run on all samples in the input directory; Not just the files mentioned in the rule's input." 
@@ -59,21 +62,24 @@ rule somalier_relate:
 
 rule somalier_ancestry:
     input:
-        extracted = lambda wildcards: expand(str(INTERIM_DIR / "somalier_extract" / wildcards.project / "{sample}.somalier"),
-                                             sample=SAMPLES[wildcards.project]),
-        somalier_tool = config['somalier']['tool'],
-        labels_1kg = config['somalier']['labels_1kg'],
-        somalier_1kg = directory(config['somalier']['somalier_1kg']),
-    log:
-        LOGS_PATH / "{project}/somalier_ancestry.log"
+        extracted=expand(str(OUT_DIR / "somalier" / "extract" / "{sample}.somalier"), sample=SAMPLES),
+        somalier_tool=config["somalier"]["tool"],
+        labels_1kg=config["somalier"]["labels_1kg"],
+        somalier_1kg=directory(config["somalier"]["somalier_1kg"]),
     output:
-        expand(str(PROCESSED_DIR / "somalier/{{project}}/ancestry/somalier.somalier-ancestry.{ext}"),
-               ext=['html', 'tsv']),
+        out=expand(
+            str(OUT_DIR / "somalier" / "ancestry" / "somalier.somalier-ancestry.{ext}"),
+            ext=["html", "tsv"],
+        ),
     message:
-        "Running somalier ancestry. Project: {wildcards.project}"
+        "Running somalier ancestry."
+    log:
+        log=OUT_DIR / "somalier" / "ancestry" / "somalier.log",
+    group:
+        "somalier"
     params:
-        outdir = lambda wildcards, output: Path(output[0]).parent,
-        indir = lambda wildcards, input: Path(input[0]).parent,
+        outdir=lambda wildcards, output: Path(output["out"][0]).parent,
+        indir=lambda wildcards, input: Path(input[0]).parent,
     shell:
         r"""
         echo "Heads up: Somalier is run on all samples in the input directory; Not just the files mentioned in the rule's input."
@@ -81,7 +87,7 @@ rule somalier_ancestry:
         {input.somalier_tool} ancestry \
             --output-prefix {params.outdir}/somalier \
             --labels {input.labels_1kg} \
-            {input.somalier_1kg}*.somalier ++ \
+            {input.somalier_1kg}/*.somalier ++ \
             {params.indir}/*.somalier \
             > {log} 2>&1
         """
diff --git a/workflow/rules/within_species_contamintation.smk b/workflow/rules/within_species_contamintation.smk
index 78ced8fbd95d95d56b6731d88a3f84527e80c604..06eabad86e387644e8f0ab8a12cddaa67ba3e080 100644
--- a/workflow/rules/within_species_contamintation.smk
+++ b/workflow/rules/within_species_contamintation.smk
@@ -1,33 +1,38 @@
 TARGETS_CONTAMINATION = [
-    get_targets('verifybamid') if {'all', 'verifybamid'}.intersection(MODULES_TO_RUN) else [],
+    get_targets("verifybamid", SAMPLES) if {"all", "verifybamid"}.intersection(MODULES_TO_RUN) else [],
 ]
 
 
+def get_svd(wildcards):
+    if EXOME_MODE:
+        return expand(f"{config['verifyBamID']['svd_dat_exome']}.{{ext}}", ext=["bed", "mu", "UD"])
+    else:
+        return expand(f"{config['verifyBamID']['svd_dat_wgs']}.{{ext}}", ext=["bed", "mu", "UD"])
+
+
 rule verifybamid:
     input:
-        bam = PROJECTS_PATH / "{project}" / "analysis" / "{sample}" / "bam" / "{sample}.bam",
-        bam_index = PROJECTS_PATH / "{project}" / "analysis" / "{sample}" / "bam" / "{sample}.bam.bai",
-        ref_genome = config['ref'],
-        svd = expand(f"{config['verifyBamID']['svd_dat']}.{{ext}}",
-                     ext=['bed', 'mu', 'UD'])
+        bam=PROJECTS_PATH / PROJECT_NAME / "analysis" / "{sample}" / "bam" / "{sample}.bam",
+        bam_index=PROJECTS_PATH / PROJECT_NAME / "analysis" / "{sample}" / "bam" / "{sample}.bam.bai",
+        ref_genome=config["ref"],
+        svd=get_svd,
    output:
-        ancestry = PROCESSED_DIR / "verifyBamID/{project}/{sample}.Ancestry",
-        selfsm = PROCESSED_DIR / "verifyBamID/{project}/{sample}.selfSM",
-    log:
-        LOGS_PATH / "{project}/verifyBamID-{sample}.log"
+        ancestry=OUT_DIR / "verifyBamID/{sample}.Ancestry",
+        selfsm=OUT_DIR / "verifyBamID/{sample}.selfSM",
     message:
-        "Running VerifyBamID to detect within-species contamination. Project: {wildcards.project}, sample: {wildcards.sample}"
+        "Running VerifyBamID to detect within-species contamination. Sample: {wildcards.sample}"
     conda:
         str(WORKFLOW_PATH / "configs/env/verifyBamID.yaml")
     params:
-        svd_prefix = lambda wildcards, input: input['svd'][0].replace(Path(input['svd'][0]).suffix, ''),
-        out_prefix = lambda wildcards, output: output['ancestry'].replace('.Ancestry', ''),
+        svd_prefix=lambda wildcards, input: input["svd"][0].replace(Path(input["svd"][0]).suffix, ""),
+        out_prefix=lambda wildcards, output: output["ancestry"].replace(".Ancestry", ""),
+    threads: 4
     shell:
         r"""
         verifybamid2 \
+            --NumThread {threads} \
             --SVDPrefix {params.svd_prefix} \
             --Reference {input.ref_genome} \
             --BamFile {input.bam} \
-            --Output {params.out_prefix} \
-            > {log} 2>&1
+            --Output {params.out_prefix}
         """