diff --git a/.gitignore b/.gitignore index 5df7defe0011c8019c0bb736d730d6e843a20a94..d5cdec77a6a3d0d2c9d7f51665fa44b82ddbb885 100644 --- a/.gitignore +++ b/.gitignore @@ -72,17 +72,11 @@ target/ .ipynb_checkpoints/ # exclude data from source control by default -# data/ -variant_annotation/data/ +data/ #snakemake .snakemake/ - -# exclude test data used for development -to_be_deleted/test_data/data/ref -to_be_deleted/test_data/data/reads - #logs logs/ diff --git a/configs/workflow.yaml b/configs/workflow.yaml index 22d858b41808c06f0be2e650e3145161c747ce3b..c846ab9a0f774d2480b0e2481d98fa73cdf7490a 100644 --- a/configs/workflow.yaml +++ b/configs/workflow.yaml @@ -1,6 +1,6 @@ -- somalier: +somalier: tool: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/somalier" sites: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/sites/sites.hg38.vcf.gz" labels_1kg: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/ancestry/ancestry-labels-1kg.tsv" - somalier_1kg: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/ancestry/somalier_1kg/*.somalier" + somalier_1kg: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/ancestry/1kg-somalier/" ref: "/data/project/worthey_lab/datasets_central/human_reference_genome/processed/GRCh38/no_alt_rel20190408/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" diff --git a/workflow/Snakefile b/workflow/Snakefile index f7cae27e793fd4b43cdea68865d92b73fd2fd003..d5935790d376c35c16f497d587f712a798ae91b0 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -7,18 +7,51 @@ module load snakemake/5.9.1-foss-2018b-Python-3.6.6 snakemake -s workflow/Snakefile --use-conda -p ''' from pathlib import Path +import pandas as pd WORKFLOW_PATH = Path(workflow.basedir).parent - configfile: WORKFLOW_PATH / "configs/workflow.yaml" + +def get_samples(fpath): + + samples = () + with open(fpath, 'r') as f_handle: + for line in f_handle: + sample = line.split(' ')[1] + samples += (sample,) + + return samples + + EXTERNAL_DIR = Path("data/external") RAW_DIR = Path("data/raw") INTERIM_DIR = Path("data/interim") PROCESSED_DIR = Path("data/processed") +PROJECTS_PATH = Path(f"/data/project/worthey_lab/projects/") +# PROJECTS_PATH = Path(f"/data/project/worthey_lab/projects/{PROJECT}/analysis") +PROJECT_NAME = "MuscDyst_SU_MAlexander" +SAMPLES = get_samples(RAW_DIR / f"ped/{PROJECT_NAME}.ped") +print (SAMPLES) + +############## MODULES ############## +include: "rules/somalier.smk" +######################################### + + +############ CONSTRAINTS ############ +wildcard_constraints: + sample = '|'.join(SAMPLES), + project = PROJECT_NAME +######################################### + rule all: input: + expand(str(PROCESSED_DIR / "somalier/{project}/somalier.html"), + project=[PROJECT_NAME]), + expand(str(PROCESSED_DIR / "somalier/{project}/somalier.somalier-ancestry.html"), + project=[PROJECT_NAME]), diff --git a/workflow/rules/somalier.smk b/workflow/rules/somalier.smk index 8d162e260b1a7ed67c3b4dd9d2cac6c1b2a3c94c..fffa89d1aa574038681988058dc386e88db5c860 100644 --- a/workflow/rules/somalier.smk +++ b/workflow/rules/somalier.smk @@ -1,20 +1,21 @@ rule extract: input: - # TODO - vcf = "" + vcf = PROJECTS_PATH / "{project}" / "analysis" / "{sample}" / "vcf" / "{sample}.vcf.gz", + vcf_index = PROJECTS_PATH / "{project}" / "analysis" / "{sample}" / "vcf" / "{sample}.vcf.gz.tbi", somalier_tool = config['somalier']['tool'], sites = config['somalier']['sites'], ref_genome = config['somalier']['ref'], output: - TODO - # message: + INTERIM_DIR / "somalier_extract/{project}/{sample}.somalier" + message: + "Running somalier extract. Project: {wildcards.project}" params: - outdir = lambda wildcards, output: Path(output[[0]]).parent, + outdir = lambda wildcards, output: Path(output[0]).parent, shell: r""" {input.somalier_tool} extract \ --sites {input.sites} \ - --fasta {input.ref} \ + --fasta {input.ref_genome} \ --out-dir {params.outdir} \ {input.vcf} """ @@ -22,15 +23,17 @@ rule extract: rule relate: input: - # TODO - extracted = "", - # ped = "" + extracted = expand(str(INTERIM_DIR / "somalier_extract/{{project}}/{sample}.somalier"), + sample=SAMPLES), + # ped = RAW_DIR / "ped" / "{project}.ped", somalier_tool = config['somalier']['tool'], output: - TODO - # message: + expand(str(PROCESSED_DIR / "somalier/{{project}}/somalier.{ext}"), + ext=['html', 'groups.tsv', 'pairs.tsv', 'samples.tsv']), + message: + "Running somalier relate. Project: {wildcards.project}" params: - outdir = lambda wildcards, output: Path(output[[0]]).parent, + outdir = lambda wildcards, output: Path(output[0]).parent, shell: r""" {input.somalier_tool} relate \ @@ -38,25 +41,28 @@ rule relate: --output-prefix {params.outdir}/somalier \ {input.extracted} """ - # --ped {input.ped} \ + # --ped {input.ped} \ rule ancestry: input: - # TODO - extracted="" + extracted = expand(str(INTERIM_DIR / "somalier_extract/{{project}}/{sample}.somalier"), + sample=SAMPLES), somalier_tool = config['somalier']['tool'], labels_1kg = config['somalier']['labels_1kg'], - somalier_1kg = config['somalier']['somalier_1kg'], + somalier_1kg = directory(config['somalier']['somalier_1kg']), output: - # message: + expand(str(PROCESSED_DIR / "somalier/{{project}}/somalier.somalier-ancestry.{ext}"), + ext=['html', 'tsv']), + message: + "Running somalier ancestry. Project: {wildcards.project}" params: - outdir = lambda wildcards, output: Path(output[[0]]).parent, + outdir = lambda wildcards, output: Path(output[0]).parent, shell: r""" {input.somalier_tool} ancestry \ --output-prefix {params.outdir}/somalier \ --labels {input.labels_1kg} \ - {input.somalier_1kg} ++ \ + {input.somalier_1kg}*.somalier ++ \ {input.extracted} """