#
# SARS-CoV-2 genome
#
#
# to run
#
# module load snakemakeslurm/5.9.1-4
# snakemakeslurm --use-conda --use-singularity
#
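# A rough equivalent without the site-specific snakemakeslurm wrapper (a
# sketch, assuming a plain snakemake >=5.9 install; --cores 12 is an
# arbitrary example value):
#
#   snakemake -n                                      # dry run first
#   snakemake --use-conda --use-singularity --cores 12
#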
import os
from snakemake.remote.NCBI import RemoteProvider as NCBIRemoteProvider
my_NCBI_Email = '@'.join([os.environ['USER'], 'uab.edu'])
NCBI = NCBIRemoteProvider(email=my_NCBI_Email)  # email required by NCBI to prevent abuse
cov2acc="NC_045512.2"
cov2abbrev="cov2WuHan1"
MINIMAP2_PRESETS=["map-ont","map-pb","asm5","asm20"]
STAR_VERS=["2.5.3a"]
BWA_VERS=["0.7.17"]
MINIMAP2_VERS=["2.17"]
rule all:
input:
# SARS-CoV-2 download from Genbank and convert GFF3->GTF
cov2=expand("{strain}/{acc}.{ext}",strain=[cov2abbrev],acc=[cov2acc],ext=["fasta", "gff3", "gtf","rRNA_gene_list.txt","dict"])
# samtools faidx
,Idx=expand("{strain}/{genome}.{ext}",strain=[cov2abbrev],genome=[cov2acc],ext=["fasta.fai"]) #,"dict"])
# star index
,StarIDX=expand("{strain}/{genome}/STAR/STAR_{align_ver}-{overhang}/{index_file}",strain=[cov2abbrev],genome=[cov2acc],overhang=["100"],index_file=["SAindex"],align_ver=STAR_VERS)
# bwa index
# MISSING - liamvdp did by hand
,BWAIdx=expand("{strain}/BWA/{bwa_ver}/{genome}.{ext}",strain=[cov2abbrev],genome=[cov2acc],ext=["fasta.fai","fasta.bwt","fasta.sa"],bwa_ver=BWA_VERS)#, "dict"])
# minimap2 (ont & pb)
,minimap2=expand("{strain}/minimap2/minimap2_{align_ver}/{preset}/{genome}.mmi", preset=MINIMAP2_PRESETS, strain=[cov2abbrev], genome=[cov2acc], align_ver=MINIMAP2_VERS )
ruleorder: AGAT_gff2gtf > AGAT_gff2ext_cleaner > gunzip
ruleorder: gtf_get_rRNA_gene_list > gunzip
rule gunzip:
wildcard_constraints: file=".+(?<!gz)" # must not end in gz
output: "{dir}/{file}"
input: "{dir}/{file}.gz"
shell: "gunzip -c '{input}' > '{output}'"
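# e.g. "somedir/annotation.gff3" can be produced from "somedir/annotation.gff3.gz",
# while the (?<!gz) lookbehind prevents the rule from matching a target that
# itself ends in .gz (which would recurse into "*.gz.gz")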
#
# pull rRNA genes
#
rule gtf_get_rRNA_gene_list:
output: "{dir}/{genome}.rRNA_gene_list.txt"
input: "{dir}/{genome}.gtf"
shell:
"awk -e 'BEGIN{{FS=\"\\t\"}}($3=\"CDS\"){{print $9}}' {input} "
# grep will exit=1 if no lines match, so use awk
#" | grep rRNA "
" | awk -e '/rRNA/{{print}}' "
" | cut -d \; -f 1 "
" | uniq "
" | cut -d \\\" -f 2 "
# grep will exit=1 if no lines match, so use awk
#" | grep -v '^$' "
" | awk -e '($0!=\"\"){{print}}' "
" > {output}"
# ----------------------------------------------------------------------
#
# SARS-Cov-2
#
# PULL FROM GENBANK
#
# ----------------------------------------------------------------------
rule ncbi_remote_fasta:
wildcard_constraints: acc="[A-Z_]+[0-9]+\\.[0-9]+"
input: NCBI.remote(["{acc}.fasta"], db="nuccore")
output: "{dir}/{acc}.fasta"
shell: "mv {input} {output}"
# no gff3 support for nuccore in NCBI.remote
# note: the same URL will also fetch FASTA if report=gff3 is changed to report=fasta
rule pull_genbank_gff3:
wildcard_constraints: acc="[A-Z_]+[0-9]+\\.[0-9]+"
output: "{dir}/{acc}.gff3"
shell: "wget -O '{output}' 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&save=file&log$=seqview&db=nuccore&report=gff3&id={wildcards.acc}&extrafeat=null&conwithfeat=on&hide-cdd=on'"
#
# GFF3 -> GTF conversion (AGAT package)
#
rule AGAT_gff2gtf:
wildcard_constraints: file=".*", dir="cov2.*"
output: "{dir}/{file}.agat_raw.gtf"
input: "{dir}/{file}.gff3"
# not working; see: https://github.com/NBISweden/AGAT/issues/4
#conda: "envs/AGAT.yaml"
singularity: "docker://quay.io/biocontainers/agat:0.2.3--pl526r35_1"
# use "_sp_" (SLURP) verion instead of "_sq_" (sequential), as memory is not an issue
shell:
"agat_convert_sp_gff2gtf.pl --gff {input} "
"> {output}"
rule AGAT_gff2ext_cleaner:
wildcard_constraints: dir="cov2.*", file=".*"
output: "{dir}/{file}.gtf"
input: "{dir}/{file}.agat_raw.gtf"
shell:
"cat '{input}' "
# remove warning from AGAT about " Primary tag values (3rd column) not expected => region"
"| egrep -v '^-' "
# remove parser chatter
"| egrep -v '^(converting to GTF3|Bye Bye)' "
"> {output}"
# ----------------------------------------------------------------------
#
# STAR index
#
# ----------------------------------------------------------------------
rule STAR_index_genome_with_gtf:
output:
expand("{{strain}}/{{genome}}/STAR/STAR_{align_ver}-{{overhang}}/{index_file}",
index_file=["SAindex", "SA", "Genome", "sjdbInfo.txt"])
input:
fa="{strain}/{genome}.fasta",
gtf="{strain}/{genome}.gtf"
# The STAR version MUST match the one used by CCTS-Informatics-Pipelines / Snakemake-Eukaryotic-RNAseq-Pipeline
# (it comes from a plug-in); we grabbed the yaml file from .snakemake/conda/8a33389f.yaml after a run.
conda: "envs/STAR_{align_ver}.yaml"
threads: 12
log:
stdout="{strain}/{genome}/STAR/STAR_{align_ver}-{overhang}/log_{overhang}_stdout.txt"
, stderr="{strain}/{genome}/STAR/STAR_{align_ver}a-{overhang}/log_{overhang}_stderr.txt"
shell:
"STAR "
" --runThreadN {threads} "
" --runMode genomeGenerate "
" --outFileNamePrefix $(dirname {output[0]}) "
" --genomeDir $(dirname {output[0]}) "
" --genomeFastaFiles {input.fa} "
" --sjdbGTFfile {input.gtf} "
" --sjdbOverhang {wildcards.overhang}"
" > {log.stdout} 2> {log.stderr} "
# ----------------------------------------------------------------------
#
# BWA index
#
# ----------------------------------------------------------------------
rule BWA_index_genome:
wildcard_constraints: ext="fasta"
output:
expand("{{strain}}/BWA/{{bwa_ver}}/{{genome}}.{{ext}}.{index_file}",
index_file=["amb", "pac", "ann", "sa","bwt"])
input:
fa="{strain}/{genome}.{ext}",
conda: "envs/bwa.{bwa_ver}.yaml"
shell:
"mkdir -p $(dirname {output[1]}) "
"&& cd $(dirname {output[1]}) "
"&& ln -fs ../../$(basename {input.fa}) . "
"&& bwa index $(basename {input.fa}) "
# ----------------------------------------------------------------------
#
# SAMtools FAI index
#
# ----------------------------------------------------------------------
rule SAMtools_index_fasta:
wildcard_constraints: ext="(fa|fasta|fna)"
output: "{file}.{ext}.fai"
input: "{file}.{ext}"
conda: "envs/samtools.yaml"
shell: "samtools faidx {input}"
# ----------------------------------------------------------------------
#
# Picard/GATK4 .dict index
#
# How can these two rules be merged so the input can be .fasta or .fna without duplicating the rule? (see the commented-out sketch after rule gatk4_fna_dict)
# ----------------------------------------------------------------------
rule gatk4_fasta_dict:
#wildcard_constraints: ext="(fa|fasta|fna)"
output: "{dir}/{file}.dict"
input: fasta="{dir}/{file}.fasta"
conda: "envs/gatk4.yaml"
shell:
"JAVA_XMX= "
" && "
"if [ ! -z ${{SLURM_MEM_PER_CPU+x}} ]; then JAVA_XMX=-Xmx$(( $SLURM_MEM_PER_CPU / 1000 * $SLURM_CPUS_ON_NODE ))g; fi "
" && "
#"java $JAVA_XMX -Djava.io.tmpdir=tmp -Djava.io.tmpdir=tmp -jar $EBROOTGATK/GenomeAnalysisTK.jar "
' gatk --java-options "$JAVA_XMX -Djava.io.tmpdir=tmp" '
" CreateSequenceDictionary "
" -R {input.fasta} "
" -O {output} "
rule gatk4_fna_dict:
#wildcard_constraints: ext="(fa|fasta|fna)"
output: "{dir}/{file}.dict"
input: fasta="{dir}/{file}.fna"
conda: "envs/gatk4.yaml"
shell:
"JAVA_XMX= "
" && "
"if [ ! -z ${{SLURM_MEM_PER_CPU+x}} ]; then JAVA_XMX=-Xmx$(( $SLURM_MEM_PER_CPU / 1000 * $SLURM_CPUS_ON_NODE ))g; fi "
" && "
#"java $JAVA_XMX -Djava.io.tmpdir=tmp -Djava.io.tmpdir=tmp -jar $EBROOTGATK/GenomeAnalysisTK.jar "
' gatk --java-options "$JAVA_XMX -Djava.io.tmpdir=tmp" '
" CreateSequenceDictionary "
" -R {input.fasta} "
" -O {output} "
######################################################################
#
# Minimap2 index
#
######################################################################
# convenience rule
localrules: minimap2
rule minimap2:
input: rules.all.input.minimap2
# input: mmi=expand("{strain}/minimap2/minimap2_2.17/{preset}/{genome}.mmi", preset=MINIMAP2_PRESETS, strain=[cov2abbrev], genome=[cov2acc] )
#
# Minimap2 index fasta
#
# -t INT Number of threads [3].
# Minimap2 uses at most three threads when indexing target sequences,
# and uses up to INT+1 threads when mapping (the extra thread is for I/O,
# which is frequently idle and takes little CPU time)
#
# hg38 uses about 14G vmem
# rule.threads * cluster.slurm.cheaha.json:rule.mem-per-cpu-mb
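# e.g. a hypothetical cluster.slurm.cheaha.json entry sized for a large genome
# (3 threads * 5000 mem-per-cpu-mb ~= 15 GB, above the ~14G noted for hg38):
#
#   "minimap2_index_preset": { "mem-per-cpu-mb": 5000 }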
rule minimap2_index_preset:
input: fa="{strain}/{genome}.fasta"
output: mmi="{strain}/minimap2/minimap2_{align_ver}/{preset}/{genome}.mmi"
threads: 3
conda: "envs/minimap2_{align_ver}.yaml"
shell:
"minimap2 "
" -t {threads} "
" -x {wildcards.preset} "
" -d {output.mmi} "
" {input.fa} "