Skip to content
Snippets Groups Projects
Snakefile 8.91 KiB
#
# SARS-CoV-2 genome
#
#
# to run
#
# module load snakemakeslurm/5.9.1-4
# snakemakeslurm --use-conda --use-singularity
#
from snakemake.remote.NCBI import RemoteProvider as NCBIRemoteProvider
my_NCBI_Email='@'.join([os.environ['USER'],'uab.edu'])
NCBI = NCBIRemoteProvider(email=my_NCBI_Email) # email required by NCBI to prevent abuse


cov2acc="NC_045512.2"
cov2abbrev="cov2WuHan1"

MINIMAP2_PRESETS=["map-ont","map-pb","asm5","asm20"]

STAR_VERS=["2.5.3a"]
BWA_VERS=["0.7.17"]
MINIMAP2_VERS=["2.17"]

rule all:
    input: 
        # SARS-CoV-2 download from Genbank and convert GFF3->GTF
        cov2=expand("{strain}/{acc}.{ext}",strain=[cov2abbrev],acc=[cov2acc],ext=["fasta", "gff3", "gtf","rRNA_gene_list.txt","dict"])
        # samtools faidx
        ,Idx=expand("{strain}/{genome}.{ext}",strain=[cov2abbrev],genome=[cov2acc],ext=["fasta.fai"]) #,"dict"]) 
        # star index
        ,StarIDX=expand("{strain}/{genome}/STAR/STAR_{align_ver}-{overhang}/{index_file}",strain=[cov2abbrev],genome=[cov2acc],overhang=["100"],index_file=["SAindex"],align_ver=STAR_VERS) 
        # bwa index
        # MISSING - liamvdp did by hand
        ,BWAIdx=expand("{strain}/BWA/{bwa_ver}/{genome}.{ext}",strain=[cov2abbrev],genome=[cov2acc],ext=["fasta.fai","fasta.bwt","fasta.sa"],bwa_ver=BWA_VERS)#, "dict"]) 
        # minimap2 (ont & pb)
        ,minimap2=expand("{strain}/minimap2/minimap2_{align_ver}/{preset}/{genome}.mmi", preset=MINIMAP2_PRESETS, strain=[cov2abbrev], genome=[cov2acc], align_ver=MINIMAP_VERS )

ruleorder: AGAT_gff2gtf > AGAT_gff2ext_cleaner > gunzip
ruleorder: gtf_get_rRNA_gene_list > gunzip

rule gunzip:
    wildcard_constraints: file=".+(?<!gz)" # not ends with gz
    output: "{dir}/{file}"
    input: "{dir}/{file}.gz"
    shell: "gunzip -c '{input}' > '{output}'"

#
# pull rRNA genes
#
rule gtf_get_rRNA_gene_list:
    output: "{dir}/{genome}.rRNA_gene_list.txt"
    input: "{dir}/{genome}.gtf"
    shell:
        "awk -e 'BEGIN{{FS=\"\\t\"}}($3=\"CDS\"){{print $9}}' {input} "
        # grep will exit=1 if no lines match, so use awk
        #" | grep rRNA "
        " | awk -e '/rRNA/{{print}}' "
        " | cut -d \; -f 1 "
        " | uniq "
        " | cut -d \\\" -f 2 "
        # grep will exit=1 if no lines match, so use awk
        #" | grep -v '^$' "
        " | awk -e '($0!=\"\"){{print}}'  "
        " > {output}"

# ----------------------------------------------------------------------
#
# SARS-Cov-2
# 
# PULL FROM GENBANK
#
# ----------------------------------------------------------------------
rule ncbi_remote_fasta: 
    wildcard_constraints: acc="[A-Z_]+[0-9]+\.[0-9]+"    
    input:  NCBI.remote(["{acc}.fasta"], db="nuccore")
    output: "{dir}/{acc}.fasta"
    shell: "mv {input} {output}"

# no gff3 support for nuccore in NCBI.remote
# note: this will also fetch fastas, if changes report=gff3 to report=fasta
rule pull_genbank_gff3:
    wildcard_constraints: acc="[A-Z_]+[0-9]+\.[0-9]+"    
    output: "{dir}/{acc}.gff3"
    shell: "wget -O '{output}' 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&save=file&log$=seqview&db=nuccore&report=gff3&id={wildcards.acc}&extrafeat=null&conwithfeat=on&hide-cdd=on'"

#
# GFF3 -> GTF conversion (AGAT package)
# 
rule AGAT_gff2gtf:
    wildcard_constraints: file=".*", dir="cov2.*"
    output: "{dir}/{file}.agat_raw.gtf"
    input: "{dir}/{file}.gff3"
    # not working; see: https://github.com/NBISweden/AGAT/issues/4
    #conda: "envs/AGAT.yaml"
    singularity: "docker://quay.io/biocontainers/agat:0.2.3--pl526r35_1"
    # use "_sp_" (SLURP) verion instead of "_sq_" (sequential), as memory is not an issue
    shell: 
        "agat_convert_sp_gff2gtf.pl --gff {input} "
        "> {output}"

rule AGAT_gff2ext_cleaner:
    wildcard_constraints: dir="cov2.*", file=".*"
    output: "{dir}/{file}.gtf"
    input: "{dir}/{file}.agat_raw.gtf"
    shell:
        "cat '{input}' "
        # remove warning from AGAT about " Primary tag values (3rd column) not expected => region"
        "| egrep -v '^-' "
        # remove parser chatter
        "| egrep -v '^(converting to GTF3|Bye Bye)'  "
        "> {output}"

# ----------------------------------------------------------------------
#
# STAR index
#
# ----------------------------------------------------------------------
rule STAR_index_genome_with_gtf:
    output: 
        expand("{{strain}}/{{genome}}/STAR/STAR_{align_ver}-{{overhang}}/{index_file}",
            index_file=["SAindex", "SA", "Genome", "sjdbInfo.txt"])
    input: 
        fa="{strain}/{genome}.fasta",
        gtf="{strain}/{genome}.gtf"
    # MUST match that from CCTS-Informatics-Pipelines / Snakemake-Eukaryotic-RNAseq-Pipeline
    # which comes from a plug-in. We grabbed the yaml file from .snakemake/conda/8a33389f.yaml after a run.
    conda: "envs/STAR_{align_ver}.yaml"
    threads: 12
    log: 
        stdout="{strain}/{genome}/STAR/STAR_{align_ver}-{overhang}/log_{overhang}_stdout.txt"
        , stderr="{strain}/{genome}/STAR/STAR_{align_ver}a-{overhang}/log_{overhang}_stderr.txt"
    shell: 
        "STAR "
        "    --runThreadN {threads} "
        "    --runMode genomeGenerate "
        "    --outFileNamePrefix $(dirname {output[0]}) "
        "    --genomeDir $(dirname {output[0]}) "
        "    --genomeFastaFiles {input.fa} "
        "    --sjdbGTFfile {input.gtf} "
        "    --sjdbOverhang {wildcards.overhang}"
        " > {log.stdout} 2> {log.stderr} "


# ----------------------------------------------------------------------
#
# STAR index
#
# ----------------------------------------------------------------------

rule BWA_index_genome:
    wildcard_constraints: ext="fasta"
    output: 
        expand("{{strain}}/BWA/{{bwa_ver}}/{{genome}}.{{ext}}.{index_file}",
               index_file=["amb", "pac", "ann", "sa","bwt"])
    input: 
        fa="{strain}/{genome}.{ext}",
    conda: "envs/bwa.{bwa_ver}.yaml"
    shell: 
        "mkdir -p $(dirname {output[1]}) "
        "&& cd $(dirname {output[1]}) "
        "&& ln -fs ../../$(basename {input.fa}) . "
        "&& bwa index  $(basename {input.fa}) "



# ----------------------------------------------------------------------
#
# SAMtools FAI index
#
# ----------------------------------------------------------------------
rule SAMtools_index_fasta:
    wildcard_constraints: ext="(fa|fasta|fna)"
    output: "{file}.{ext}.fai"
    input: "{file}.{ext}"
    conda: "envs/samtools.yaml"
    shell: "samtools faidx {input}"

# ----------------------------------------------------------------------
#
# Picard/GATK4 .dict index
#
# How to merge these two rules so input can be .fasta or .fna w/o dup rule?
# ----------------------------------------------------------------------
rule gatk4_fasta_dict:
    #wildcard_constraints: ext="(fa|fasta|fna)"
    output: "{dir}/{file}.dict"
    input: fasta="{dir}/{file}.fasta"
    conda: "envs/gatk4.yaml"
    shell:
        "JAVA_XMX= "
        " && "
        "if [ ! -z ${{SLURM_MEM_PER_CPU+x}} ]; then JAVA_XMX=-Xmx$(( $SLURM_MEM_PER_CPU / 1000 * $SLURM_CPUS_ON_NODE ))g; fi "
        " && "
        #"java $JAVA_XMX -Djava.io.tmpdir=tmp -Djava.io.tmpdir=tmp -jar $EBROOTGATK/GenomeAnalysisTK.jar "
        ' gatk --java-options "$JAVA_XMX -Djava.io.tmpdir=tmp -Djava.io.tmpdir=tmp" '
        " CreateSequenceDictionary "
        "  -R {input.fasta} "
        "  -O {output} "

rule gatk4_fna_dict:
    #wildcard_constraints: ext="(fa|fasta|fna)"
    output: "{dir}/{file}.dict"
    input: fasta="{dir}/{file}.fna"
    conda: "envs/gatk4.yaml"
    shell:
        "JAVA_XMX= "
        " && "
        "if [ ! -z ${{SLURM_MEM_PER_CPU+x}} ]; then JAVA_XMX=-Xmx$(( $SLURM_MEM_PER_CPU / 1000 * $SLURM_CPUS_ON_NODE ))g; fi "
        " && "
        #"java $JAVA_XMX -Djava.io.tmpdir=tmp -Djava.io.tmpdir=tmp -jar $EBROOTGATK/GenomeAnalysisTK.jar "
        ' gatk --java-options "$JAVA_XMX -Djava.io.tmpdir=tmp -Djava.io.tmpdir=tmp" '
        " CreateSequenceDictionary "
        "  -R {input.fasta} "
        "  -O {output} "
   
######################################################################
#
# Minimap2 index
#
######################################################################

# convenience rule
localrules: minimap2
rule minimap2:
    input: rules.all.input.minimap2
#    input:  mmi=expand("{strain}/minimap2/minimap2_2.17/{preset}/{genome}.mmi", preset=MINIMAP2_PRESETS, strain=[cov2abbrev], genome=[cov2acc] )


#
# Minimap2 index fasta
#
# -t INT	Number of threads [3]. 
# Minimap2 uses at most three threads when indexing target sequences, 
# and uses up to INT+1 threads when mapping (the extra thread is for I/O, 
# which is frequently idle and takes little CPU time)
#
# hg38 uses about 14G vmem
#   rule.threads * cluster.slurm.cheaha.json:rule.mem-per-cpu-mb
         
rule minimap2_index_preset:
    input: fa="{strain}/{genome}.fasta"
    output: mmi="{strain}/minimap2/minimap2_{align_ver}/{preset}/{genome}.mmi"
    threads: 3
    conda: "envs/minimap2_{align_ver}.yaml"
    shell: 
        "minimap2 "
        " -t {threads} "
        " -x {wildcards.preset} "
        " -d {output.mmi} "
        " {input.fa} "