From 012cbbe7c4f7cc7f200562c41c5f7d9c133c9a1d Mon Sep 17 00:00:00 2001
From: Manavalan Gajapathy <manag@uab.edu>
Date: Thu, 11 Feb 2021 09:38:16 -0600
Subject: [PATCH] gets somalier working

---
 .gitignore                  |  8 +------
 configs/workflow.yaml       |  4 ++--
 workflow/Snakefile          | 35 ++++++++++++++++++++++++++++-
 workflow/rules/somalier.smk | 44 +++++++++++++++++++++----------------
 4 files changed, 62 insertions(+), 29 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5df7def..d5cdec7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,17 +72,11 @@ target/
 .ipynb_checkpoints/
 
 # exclude data from source control by default
-# data/
-variant_annotation/data/
+data/
 
 #snakemake
 .snakemake/
 
-
-# exclude test data used for development
-to_be_deleted/test_data/data/ref
-to_be_deleted/test_data/data/reads
-
 #logs
 logs/
 
diff --git a/configs/workflow.yaml b/configs/workflow.yaml
index 22d858b..c846ab9 100644
--- a/configs/workflow.yaml
+++ b/configs/workflow.yaml
@@ -1,6 +1,6 @@
-- somalier:
+somalier:
     tool: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/somalier"
     sites: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/sites/sites.hg38.vcf.gz"
     labels_1kg: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/ancestry/ancestry-labels-1kg.tsv"
-    somalier_1kg: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/ancestry/somalier_1kg/*.somalier"
+    somalier_1kg: "/data/project/worthey_lab/projects/experimental_pipelines/mana/tools/somalier/0.2.12/ancestry/1kg-somalier/"
     ref: "/data/project/worthey_lab/datasets_central/human_reference_genome/processed/GRCh38/no_alt_rel20190408/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
diff --git a/workflow/Snakefile b/workflow/Snakefile
index f7cae27..d593579 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -7,18 +7,51 @@ module load snakemake/5.9.1-foss-2018b-Python-3.6.6
 snakemake -s workflow/Snakefile --use-conda -p
 '''
 from pathlib import Path
+import pandas as pd
 
 WORKFLOW_PATH = Path(workflow.basedir).parent
-
 configfile: WORKFLOW_PATH / "configs/workflow.yaml"
 
+
+def get_samples(fpath):
+
+    samples = ()
+    with open(fpath, 'r') as f_handle:
+        for line in f_handle:
+            sample = line.split(' ')[1]
+            samples += (sample,)
+
+    return samples
+
+
 EXTERNAL_DIR = Path("data/external")
 RAW_DIR = Path("data/raw")
 INTERIM_DIR = Path("data/interim")
 PROCESSED_DIR = Path("data/processed")
 
+PROJECTS_PATH = Path(f"/data/project/worthey_lab/projects/")
+# PROJECTS_PATH = Path(f"/data/project/worthey_lab/projects/{PROJECT}/analysis")
+PROJECT_NAME = "MuscDyst_SU_MAlexander"
+SAMPLES = get_samples(RAW_DIR / f"ped/{PROJECT_NAME}.ped")
+print (SAMPLES)
+
+##############   MODULES   ##############
+include: "rules/somalier.smk"
+#########################################
+
+
+############   CONSTRAINTS   ############
+wildcard_constraints:
+    sample = '|'.join(SAMPLES),
+    project = PROJECT_NAME
+#########################################
+
 
 rule all:
     input:
+        expand(str(PROCESSED_DIR / "somalier/{project}/somalier.html"),
+            project=[PROJECT_NAME]),
+        expand(str(PROCESSED_DIR / "somalier/{project}/somalier.somalier-ancestry.html"),
+            project=[PROJECT_NAME]),
 
 
diff --git a/workflow/rules/somalier.smk b/workflow/rules/somalier.smk
index 8d162e2..fffa89d 100644
--- a/workflow/rules/somalier.smk
+++ b/workflow/rules/somalier.smk
@@ -1,20 +1,21 @@
 rule extract:
     input:
-        # TODO
-        vcf = ""
+        vcf = PROJECTS_PATH / "{project}" / "analysis" / "{sample}" / "vcf" / "{sample}.vcf.gz",
+        vcf_index = PROJECTS_PATH / "{project}" / "analysis" / "{sample}" / "vcf" / "{sample}.vcf.gz.tbi",
         somalier_tool = config['somalier']['tool'],
         sites = config['somalier']['sites'],
         ref_genome = config['somalier']['ref'],
     output:
-        TODO
-    # message:
+        INTERIM_DIR / "somalier_extract/{project}/{sample}.somalier"
+    message:
+        "Running somalier extract. Project: {wildcards.project}"
     params:
-        outdir = lambda wildcards, output: Path(output[[0]]).parent,
+        outdir = lambda wildcards, output: Path(output[0]).parent,
     shell:
         r"""
         {input.somalier_tool} extract \
             --sites {input.sites} \
-            --fasta {input.ref} \
+            --fasta {input.ref_genome} \
             --out-dir {params.outdir} \
             {input.vcf}
         """
@@ -22,15 +23,17 @@ rule extract:
 
 rule relate:
     input:
-        # TODO
-        extracted = "",
-        # ped = ""
+        extracted = expand(str(INTERIM_DIR / "somalier_extract/{{project}}/{sample}.somalier"),
+                        sample=SAMPLES),
+        # ped = RAW_DIR / "ped" / "{project}.ped",
         somalier_tool = config['somalier']['tool'],
     output:
-        TODO
-    # message:
+        expand(str(PROCESSED_DIR / "somalier/{{project}}/somalier.{ext}"),
+                ext=['html', 'groups.tsv', 'pairs.tsv', 'samples.tsv']),
+    message:
+        "Running somalier relate. Project: {wildcards.project}"
     params:
-        outdir = lambda wildcards, output: Path(output[[0]]).parent,
+        outdir = lambda wildcards, output: Path(output[0]).parent,
     shell:
         r"""
         {input.somalier_tool} relate \
@@ -38,25 +41,28 @@ rule relate:
             --output-prefix {params.outdir}/somalier \
             {input.extracted}
         """
-        # --ped {input.ped} \
+            # --ped {input.ped} \
 
 
 rule ancestry:
     input:
-        # TODO
-        extracted=""
+        extracted = expand(str(INTERIM_DIR / "somalier_extract/{{project}}/{sample}.somalier"),
+                        sample=SAMPLES),
         somalier_tool = config['somalier']['tool'],
         labels_1kg = config['somalier']['labels_1kg'],
-        somalier_1kg = config['somalier']['somalier_1kg'],
+        somalier_1kg = directory(config['somalier']['somalier_1kg']),
     output:
-    # message:
+        expand(str(PROCESSED_DIR / "somalier/{{project}}/somalier.somalier-ancestry.{ext}"),
+                ext=['html', 'tsv']),
+    message:
+        "Running somalier ancestry. Project: {wildcards.project}"
     params:
-        outdir = lambda wildcards, output: Path(output[[0]]).parent,
+        outdir = lambda wildcards, output: Path(output[0]).parent,
     shell:
         r"""
         {input.somalier_tool} ancestry \
             --output-prefix {params.outdir}/somalier \
             --labels {input.labels_1kg} \
-            {input.somalier_1kg} ++ \
+            {input.somalier_1kg}*.somalier ++ \
             {input.extracted}
         """
-- 
GitLab