diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 37d82163233a011594ec21bd64f7e3e5e26e7c89..0000000000000000000000000000000000000000 --- a/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "configs/snakemake_slurm_profile"] - path = configs/snakemake_slurm_profile - url = git@gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/sciops/external-projects/snakemake_slurm_profile.git -[submodule "src/utility_cgds"] - path = src/utility_cgds - url = git@gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/utility-images.git diff --git a/.test/README.md b/.test/README.md index 0b5b466a3dd2a3ed78ac266ac13e8f37b2182d5e..759516c9017b94ef39d742496dc71b329f0acd1b 100644 --- a/.test/README.md +++ b/.test/README.md @@ -1,29 +1,34 @@ # Testing -Output from [Small variant caller -pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/sciops/pipelines/small_variant_caller_pipeline) -are the inputs to QuaC pipeline. Hence following datasets are necessary for testing: +Input directory structure to QuaC is based on the output directory structure of the [Small variant caller +pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/sciops/pipelines/small_variant_caller_pipeline). +Following files are necessary for testing: 1. bams 2. vcfs -3. QC output (from tools fastqc, fastq-screen and picard-markduplicates) -4. Sample rename config +3. Capture regions bed file - Required only for exome mode +4. QC output from tools fastqc, fastq-screen and picard-markduplicates - Required only if `priorQC` is used +5. Sample rename config - Required only if `priorQC` is used -Note: Be sure to preserve directory structure used in the output of Small variant caller +**Note**: If `priorQC` is used, be sure to preserve directory structure used in the output of CGDS Small variant caller pipeline. 
## Setup test datasets -* To setup test bam and vcf files, which are from sub-sampled NA12878 data, run: +### Required + +* To setup test bam, vcf and capture region bed files, which are from sub-sampled NA12878 data, run: ```sh cd .test ./setup_test_datasets.sh ``` -* QuaC also needs test QC outputs for fastq (and sample rename config), which get created by small var caller pipeline. - This was achieved by running the small variant caller pipeline using its test datasets with some modifications. Steps - are briefly shown here: +### Optional - priorQC mode + +* If used in `priorQC` mode, QuaC also needs test QC outputs for fastq (and sample rename config), which at CGDS get + created by the small var caller pipeline. Below, we create fastq QC and sample rename config using the small variant + caller pipeline for samples `A` and `B`. ```sh cd <small_var_caller_pipeline_dir> diff --git a/.test/configs/project_1_sample.ped b/.test/configs/include_priorQC/project_1sample.ped similarity index 100% rename from .test/configs/project_1_sample.ped rename to .test/configs/include_priorQC/project_1sample.ped diff --git a/.test/configs/project_2_samples.ped b/.test/configs/include_priorQC/project_2samples.ped similarity index 100% rename from .test/configs/project_2_samples.ped rename to .test/configs/include_priorQC/project_2samples.ped diff --git a/.test/configs/no_priorQC/project_1sample.ped b/.test/configs/no_priorQC/project_1sample.ped new file mode 100644 index 0000000000000000000000000000000000000000..089113ca876d18fcae97a9e29da7e1f46af63e9b --- /dev/null +++ b/.test/configs/no_priorQC/project_1sample.ped @@ -0,0 +1,2 @@ +#family_id sample_id paternal_id maternal_id sex phenotype +unknown C father_1 mother_1 -9 -9 diff --git a/.test/configs/no_priorQC/project_2samples.ped b/.test/configs/no_priorQC/project_2samples.ped new file mode 100644 index 0000000000000000000000000000000000000000..487faa75d2ff05eb3f215de99bd45602e0c9b455 --- /dev/null +++ 
b/.test/configs/no_priorQC/project_2samples.ped @@ -0,0 +1,3 @@ +#family_id sample_id paternal_id maternal_id sex phenotype +unknown C father_1 mother_1 -9 -9 +unknown D father_1 mother_1 -9 -9 diff --git a/.test/ngs-data/test_project/analysis/A/qc/dedup/A-1.metrics.txt b/.test/ngs-data/test_project/analysis/A/qc/dedup/A-1.metrics.txt index 19b693bb07aa00ae3be931f5a13778b84893ead1..455913aa97907f0ea4a0cbe4a78f93843506c275 100644 --- a/.test/ngs-data/test_project/analysis/A/qc/dedup/A-1.metrics.txt +++ b/.test/ngs-data/test_project/analysis/A/qc/dedup/A-1.metrics.txt @@ -1,5 +1,5 @@ ## htsjdk.samtools.metrics.StringHeader -# MarkDuplicates INPUT=[/data/scratch/manag/test_pipeline/small_variant_caller/interim/A/mapped/A-1.sorted.bam] OUTPUT=/data/scratch/manag/test_pipeline/small_variant_caller/interim/A/dedup/A-1.bam METRICS_FILE=/data/scratch/manag/test_pipeline/small_variant_caller/analysis/A/qc/dedup/A-1.metrics.txt REMOVE_DUPLICATES=true TMP_DIR=[/tmp] MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag CLEAR_DT=true DUPLEX_UMI=false ADD_PG_TAG_TO_READS=true ASSUME_SORTED=false DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false +# MarkDuplicates INPUT=[/test_pipeline/small_variant_caller/interim/A/mapped/A-1.sorted.bam] OUTPUT=/test_pipeline/small_variant_caller/interim/A/dedup/A-1.bam 
METRICS_FILE=/test_pipeline/small_variant_caller/analysis/A/qc/dedup/A-1.metrics.txt REMOVE_DUPLICATES=true TMP_DIR=[/tmp] MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag CLEAR_DT=true DUPLEX_UMI=false ADD_PG_TAG_TO_READS=true ASSUME_SORTED=false DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false ## htsjdk.samtools.metrics.StringHeader # Started on: Fri Apr 02 19:39:58 UTC 2021 diff --git a/.test/ngs-data/test_project/analysis/A/qc/dedup/A-2.metrics.txt b/.test/ngs-data/test_project/analysis/A/qc/dedup/A-2.metrics.txt index ce24b754ad346218941e37f60fe710a4504187bd..73fe318ae22d92e9ea34c1755b0727a47bb6d4d0 100644 --- a/.test/ngs-data/test_project/analysis/A/qc/dedup/A-2.metrics.txt +++ b/.test/ngs-data/test_project/analysis/A/qc/dedup/A-2.metrics.txt @@ -1,5 +1,5 @@ ## htsjdk.samtools.metrics.StringHeader -# MarkDuplicates INPUT=[/data/scratch/manag/test_pipeline/small_variant_caller/interim/A/mapped/A-2.sorted.bam] OUTPUT=/data/scratch/manag/test_pipeline/small_variant_caller/interim/A/dedup/A-2.bam METRICS_FILE=/data/scratch/manag/test_pipeline/small_variant_caller/analysis/A/qc/dedup/A-2.metrics.txt REMOVE_DUPLICATES=true TMP_DIR=[/tmp] MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag CLEAR_DT=true 
DUPLEX_UMI=false ADD_PG_TAG_TO_READS=true ASSUME_SORTED=false DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false +# MarkDuplicates INPUT=[/test_pipeline/small_variant_caller/interim/A/mapped/A-2.sorted.bam] OUTPUT=/test_pipeline/small_variant_caller/interim/A/dedup/A-2.bam METRICS_FILE=/test_pipeline/small_variant_caller/analysis/A/qc/dedup/A-2.metrics.txt REMOVE_DUPLICATES=true TMP_DIR=[/tmp] MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag CLEAR_DT=true DUPLEX_UMI=false ADD_PG_TAG_TO_READS=true ASSUME_SORTED=false DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false ## htsjdk.samtools.metrics.StringHeader # Started on: Fri Apr 02 19:40:06 UTC 2021 diff --git a/.test/ngs-data/test_project/analysis/B/qc/dedup/B-1.metrics.txt b/.test/ngs-data/test_project/analysis/B/qc/dedup/B-1.metrics.txt index f797efe7c843e84de747ce0f8cd59560b71dcd39..7e56ee539c2d90905bba731ca231ea00a36250bc 100644 --- 
a/.test/ngs-data/test_project/analysis/B/qc/dedup/B-1.metrics.txt +++ b/.test/ngs-data/test_project/analysis/B/qc/dedup/B-1.metrics.txt @@ -1,5 +1,5 @@ ## htsjdk.samtools.metrics.StringHeader -# MarkDuplicates INPUT=[/data/scratch/manag/test_pipeline/small_variant_caller/interim/B/mapped/B-1.sorted.bam] OUTPUT=/data/scratch/manag/test_pipeline/small_variant_caller/interim/B/dedup/B-1.bam METRICS_FILE=/data/scratch/manag/test_pipeline/small_variant_caller/analysis/B/qc/dedup/B-1.metrics.txt REMOVE_DUPLICATES=true TMP_DIR=[/tmp] MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag CLEAR_DT=true DUPLEX_UMI=false ADD_PG_TAG_TO_READS=true ASSUME_SORTED=false DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false +# MarkDuplicates INPUT=[/test_pipeline/small_variant_caller/interim/B/mapped/B-1.sorted.bam] OUTPUT=/test_pipeline/small_variant_caller/interim/B/dedup/B-1.bam METRICS_FILE=/test_pipeline/small_variant_caller/analysis/B/qc/dedup/B-1.metrics.txt REMOVE_DUPLICATES=true TMP_DIR=[/tmp] MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag CLEAR_DT=true DUPLEX_UMI=false ADD_PG_TAG_TO_READS=true ASSUME_SORTED=false DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates 
PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false ## htsjdk.samtools.metrics.StringHeader # Started on: Fri Apr 02 19:39:58 UTC 2021 diff --git a/.test/ngs-data/test_project/analysis/B/qc/dedup/B-2.metrics.txt b/.test/ngs-data/test_project/analysis/B/qc/dedup/B-2.metrics.txt index cd7da956144a3175aa1d4041e70e776d650ba09b..f01c0b9947f95a6d7a20486e4b9ca1edd3aff7d4 100644 --- a/.test/ngs-data/test_project/analysis/B/qc/dedup/B-2.metrics.txt +++ b/.test/ngs-data/test_project/analysis/B/qc/dedup/B-2.metrics.txt @@ -1,5 +1,5 @@ ## htsjdk.samtools.metrics.StringHeader -# MarkDuplicates INPUT=[/data/scratch/manag/test_pipeline/small_variant_caller/interim/B/mapped/B-2.sorted.bam] OUTPUT=/data/scratch/manag/test_pipeline/small_variant_caller/interim/B/dedup/B-2.bam METRICS_FILE=/data/scratch/manag/test_pipeline/small_variant_caller/analysis/B/qc/dedup/B-2.metrics.txt REMOVE_DUPLICATES=true TMP_DIR=[/tmp] MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag CLEAR_DT=true DUPLEX_UMI=false ADD_PG_TAG_TO_READS=true ASSUME_SORTED=false DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false 
GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false +# MarkDuplicates INPUT=[/test_pipeline/small_variant_caller/interim/B/mapped/B-2.sorted.bam] OUTPUT=/test_pipeline/small_variant_caller/interim/B/dedup/B-2.bam METRICS_FILE=/test_pipeline/small_variant_caller/analysis/B/qc/dedup/B-2.metrics.txt REMOVE_DUPLICATES=true TMP_DIR=[/tmp] MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag CLEAR_DT=true DUPLEX_UMI=false ADD_PG_TAG_TO_READS=true ASSUME_SORTED=false DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false ## htsjdk.samtools.metrics.StringHeader # Started on: Fri Apr 02 19:40:06 UTC 2021 diff --git a/.test/ngs-data/test_project/analysis/C/bam/C.bam b/.test/ngs-data/test_project/analysis/C/bam/C.bam new file mode 100644 index 0000000000000000000000000000000000000000..7f45cc67d59e2a2fabb0d1714b0ba7d2fbe03356 Binary files /dev/null and b/.test/ngs-data/test_project/analysis/C/bam/C.bam differ diff --git a/.test/ngs-data/test_project/analysis/C/bam/C.bam.bai b/.test/ngs-data/test_project/analysis/C/bam/C.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..9d8a8bc0af8a66c1c21f55fe6eec3f0e46bcd84e Binary files /dev/null and b/.test/ngs-data/test_project/analysis/C/bam/C.bam.bai differ diff --git a/.test/ngs-data/test_project/analysis/C/configs/small_variant_caller/capture_regions.bed 
b/.test/ngs-data/test_project/analysis/C/configs/small_variant_caller/capture_regions.bed new file mode 100644 index 0000000000000000000000000000000000000000..d1ae1edfd071338d22e0fea33241a989f3c441ef --- /dev/null +++ b/.test/ngs-data/test_project/analysis/C/configs/small_variant_caller/capture_regions.bed @@ -0,0 +1,2 @@ +chr20 59992 3653078 + diff --git a/.test/ngs-data/test_project/analysis/C/vcf/C.vcf.gz b/.test/ngs-data/test_project/analysis/C/vcf/C.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..b88140f02881d358008c155fc81272e53dfea19c Binary files /dev/null and b/.test/ngs-data/test_project/analysis/C/vcf/C.vcf.gz differ diff --git a/.test/ngs-data/test_project/analysis/C/vcf/C.vcf.gz.tbi b/.test/ngs-data/test_project/analysis/C/vcf/C.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..caeb9ef1fa509ce233eb5854e4fc79c0ee38c4dc Binary files /dev/null and b/.test/ngs-data/test_project/analysis/C/vcf/C.vcf.gz.tbi differ diff --git a/.test/ngs-data/test_project/analysis/D/bam/D.bam b/.test/ngs-data/test_project/analysis/D/bam/D.bam new file mode 100644 index 0000000000000000000000000000000000000000..e99c07f146d0c1abcd7c7d6b6400f90af6012c05 Binary files /dev/null and b/.test/ngs-data/test_project/analysis/D/bam/D.bam differ diff --git a/.test/ngs-data/test_project/analysis/D/bam/D.bam.bai b/.test/ngs-data/test_project/analysis/D/bam/D.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..5f9872d165c27346d6db7bba5397adb338e9a783 Binary files /dev/null and b/.test/ngs-data/test_project/analysis/D/bam/D.bam.bai differ diff --git a/.test/ngs-data/test_project/analysis/D/configs/small_variant_caller/capture_regions.bed b/.test/ngs-data/test_project/analysis/D/configs/small_variant_caller/capture_regions.bed new file mode 100644 index 0000000000000000000000000000000000000000..d1ae1edfd071338d22e0fea33241a989f3c441ef --- /dev/null +++ 
b/.test/ngs-data/test_project/analysis/D/configs/small_variant_caller/capture_regions.bed @@ -0,0 +1,2 @@ +chr20 59992 3653078 + diff --git a/.test/ngs-data/test_project/analysis/D/vcf/D.vcf.gz b/.test/ngs-data/test_project/analysis/D/vcf/D.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..b88140f02881d358008c155fc81272e53dfea19c Binary files /dev/null and b/.test/ngs-data/test_project/analysis/D/vcf/D.vcf.gz differ diff --git a/.test/ngs-data/test_project/analysis/D/vcf/D.vcf.gz.tbi b/.test/ngs-data/test_project/analysis/D/vcf/D.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..caeb9ef1fa509ce233eb5854e4fc79c0ee38c4dc Binary files /dev/null and b/.test/ngs-data/test_project/analysis/D/vcf/D.vcf.gz.tbi differ diff --git a/.test/setup_test_datasets.sh b/.test/setup_test_datasets.sh index d7df7fbbfe96fcbce5a2e688c2f3762ca2d61371..0e2efb635a6efc073b54c278933c26d24a33af24 100755 --- a/.test/setup_test_datasets.sh +++ b/.test/setup_test_datasets.sh @@ -19,7 +19,8 @@ TARGET_REGION="chr20:59993-3653078" samtools view -s 0.03 -b $NA12878_BAM $TARGET_REGION > $SUBSAMPLED_BAM PROJECT_DIR="ngs-data/test_project/analysis" -for sample in A B; do +SAMPLES="A B C D" +for sample in $SAMPLES; do ### bams ### BAM_DIR="${PROJECT_DIR}/${sample}/bam" mkdir -p $BAM_DIR @@ -44,7 +45,7 @@ rm -f $SUBSAMPLED_BAM echo "Setting up test vcf files..." 
NA12878_VCF="/data/project/worthey_lab/samples/NA12878/analysis/small_variants/na12878.vcf.gz" -for sample in A B; do +for sample in $SAMPLES; do VCF_DIR="${PROJECT_DIR}/${sample}/vcf" mkdir -p $VCF_DIR OUT_vcf=${VCF_DIR}/${sample}.vcf.gz @@ -57,7 +58,9 @@ done ############# Regions file ############# -# Treat sample B as exome dataset and add a capture-regions bed file -CAPTURE_FILE="${PROJECT_DIR}/B/configs/small_variant_caller/capture_regions.bed" -mkdir -p $(dirname $CAPTURE_FILE) -echo -e "chr20\t59992\t3653078\n" > $CAPTURE_FILE +# For exome mode testing, add capture-regions bed file +for sample in $SAMPLES; do + CAPTURE_FILE="${PROJECT_DIR}/${sample}/configs/small_variant_caller/capture_regions.bed" + mkdir -p $(dirname $CAPTURE_FILE) + echo -e "chr20\t59992\t3653078\n" > $CAPTURE_FILE +done \ No newline at end of file diff --git a/Changelog.md b/Changelog.md index 2b16286871f80debee19c5071876364e3d768f52..00a80e4ef52c8a5063ca0d8d9d1211e9cd9538d2 100644 --- a/Changelog.md +++ b/Changelog.md @@ -39,4 +39,16 @@ YYYY-MM-DD John Doe 2022-04-07 Manavalan Gajapathy * Previously hardcoded hardware resources for snakemake rules can now be supplied via `configs/workflow.yaml` (closes #48) -* Modified multiqc conda env config to use explicit dependencies to get around installation issues (closes #47) \ No newline at end of file +* Modified multiqc conda env config to use explicit dependencies to get around installation issues (closes #47) + + +2023-01-20 Manavalan Gajapathy + +As part of making QuaC publicly available, following updates were made to make it more generic to the environment and user friendly: + +* Removes prerun QC from small variant caller pipeline as requirement to QuaC (closes #45) +* Explicitly defines conda environments (closes #49) +* Uses container solution for `covviz` installation instead of conda to avoid pip based installation (closes #52) +* Removes git submodules and instead saves their local copy to repo (closes #53) +* Loads singularity 
module prior to executing the runner script +* Uses minimal snakemake instead of full-featured snakemake (closes #56) \ No newline at end of file diff --git a/README.md b/README.md index 81a5c59a41bc22b7ac04030e07e50f17896684e9..1497ded2ce6dd7b5df191ca22a67cbdbd8f0b730 100644 --- a/README.md +++ b/README.md @@ -1,55 +1,27 @@ -- [QuaC](#quac) - - [What is QuaC?](#what-is-quac) - - [QC tools included](#qc-tools-included) - - [QuaC-Watch](#quac-watch) - - [Installation](#installation) - - [Requirements](#requirements) - - [Retrieve pipeline source code](#retrieve-pipeline-source-code) - - [Create conda environment](#create-conda-environment) - - [How to run QuaC](#how-to-run-quac) - - [Requirements](#requirements-1) - - [Set up workflow config file](#set-up-workflow-config-file) - - [Prepare verifybamid datasets for exome analysis](#prepare-verifybamid-datasets-for-exome-analysis) - - [Run pipeline](#run-pipeline) - - [Create singularity+conda environments for tools used in QuaC pipeline](#create-singularityconda-environments-for-tools-used-in-quac-pipeline) - - [Input requirements](#input-requirements) - - [Example usage](#example-usage) - - [Output](#output) - - [Testing pipeline](#testing-pipeline) - - [How to run](#how-to-run) - - [Expected output files](#expected-output-files) - - [Visualization of workflow](#visualization-of-workflow) - - [Contributing](#contributing) - - [Changelog](#changelog) - # QuaC 🦆🦆 Don't duck that QC thingy 🦆🦆 ## What is QuaC? -QuaC is a snakemake-based pipeline that runs several QC tools and summarizes results for WGS/WES samples processed at -CGDS using internally defined QC thresholds. It is a companion pipeline that should be run after samples in a project -are run through [CGDS's small variant caller -pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/sciops/pipelines/small_variant_caller_pipeline). 
- -In short, QuaC performs the following: +QuaC is a snakemake-based **pipeline** that runs several QC tools for WGS/WES samples and then summarizes their results +using pre-defined, configurable QC thresholds. -- Runs various QC tools using data produced by the small variant caller pipeline -- Using *QuaC-Watch* tool, it performs QC checkup based on the expected thresholds and summarizes the results for - easy consumption -- Aggregates QC output produced here as well as those by the small variant caller pipeline using mulitqc, both at sample - level and project level +In summary, QuaC performs the following: -**Note**: - -1. While QuaC does the heavy lifting in performing QC, the small variant caller pipeline also runs few QC tools (fastqc, - fastq-screen, picard's markduplicates). This setup was chosen deliberately for pragmatic reasons. -2. *Use QuaC-Watch results with extreme caution when run in exome mode.* Though QuaC can be run in exome mode, - QuaC-Watch thresholds utilized are not yet as reliable as that used for WGS datasets. +- Runs several QC tools using `BAM` and `VCF` files as input. At our center CGDS, these files are produced as part of + the [small variant caller + pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/sciops/pipelines/small_variant_caller_pipeline). +- Using *QuaC-Watch* tool, it performs QC checkup based on the expected thresholds for certain QC metrics and summarizes + the results for easier human consumption +- Aggregates QC output produced here as well as those by the small variant caller pipeline using multiqc, both at the + sample level and project level. +- Optionally, the above-mentioned QC checkup and QC aggregation steps can accept pre-run results from a few QC tools (fastqc, + fastq-screen, picard's markduplicates). 
At CGDS, these files are produced as part of the [small variant caller + pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/sciops/pipelines/small_variant_caller_pipeline). -### QC tools included +### QC tools utilized QuaC quacks using the tools listed below: @@ -57,7 +29,7 @@ QuaC quacks using the tools listed below: | -------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- | | *BAM QC* | | | [Qualimap](http://qualimap.conesalab.org/) | QCs alignment data in SAM/BAM files | -| [Picard-CollectMultipleMetrics](https://broadinstitute.github.io/picard/command-line-overview.html#CollectMultipleMetrics) | summarizes alignment metrics from a SAM/BAM file using several modules | +| [Picard-CollectMultipleMetrics](https://broadinstitute.github.io/picard/command-line-overview.html#CollectMultipleMetrics) | Summarizes alignment metrics from a SAM/BAM file using several modules | | [Picard-CollectWgsMetrics](https://broadinstitute.github.io/picard/command-line-overview.html#CollectWgsMetrics) | Collects metrics about coverage and performance of whole genome sequencing (WGS) experiments. 
| | [mosdepth](https://github.com/brentp/mosdepth) | Fast BAM/CRAM depth calculation | | [indexcov](https://github.com/brentp/goleft/tree/master/indexcov) | Estimate coverage from whole-genome bam or cram index (Not run in exome mode) | @@ -69,9 +41,11 @@ QuaC quacks using the tools listed below: | *Sex, ancestry and relatedness estimation* | | | [somalier](https://github.com/brentp/somalier) | Estimation of sex, ancestry and relatedness | +### Optional tools' results consumption -In addition to this, QuaC also utilizes QC results produced by following tools as part of the [small variant caller -pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/small_variant_caller_pipeline/). +In addition to the above tools, optionally QuaC can also utilize QC results produced by the tools below when run with +flag `--include_prior_qc`. At CGDS, these files are produced as part of the [small variant caller +pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/sciops/pipelines/small_variant_caller_pipeline). | Tool | Use | | ------------------------------------------------------------------------------------------------------------ | ------------------------------------------------- | @@ -83,10 +57,10 @@ pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-s ### QuaC-Watch QuaC includes a tool called QuaC-Watch. After running all the QC tools for samples, QuaC-Watch summarizes if samples -have passed the QC thresholds (defined via config file -[`wgs_quac_watch_config.yaml`](configs/quac_watch/wgs_quac_watch_config.yaml); can be user-configured), both at the -sample level as well as project level. This summary makes it easy to quickly review if sample or samples have sufficient -quality and highlight samples that need further review. 
+have passed the configurable QC thresholds defined using config files (available at +[`configs/quac_watch/`](./configs/quac_watch/)), both at the sample level as well as project level. This summary makes +it easy to quickly review whether sample or samples are of sufficient quality and highlight samples that need further +review. ## Installation @@ -102,20 +76,17 @@ Installation requires - [SSH Key for access](https://docs.uabgrid.uab.edu/wiki/Cheaha_GettingStarted#Logging_in_to_Cheaha) to Cheaha cluster - Anaconda/miniconda - Tested with Anaconda3/2020.02 + - Available as module from cheaha at UAB ### Retrieve pipeline source code -This repository use git submodules, which need to be pulled when cloning. Go to the directory of your choice and run the -command below. +Go to the directory of your choice and run the command below. ```sh git clone -b master \ - --recurse-submodules \ git@gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/sciops/pipelines/quac.git ``` -Note that downloading this repository from GitLab, instead of cloning, may not fetch the submodules included. - ### Create conda environment @@ -124,11 +95,13 @@ Conda environment will install all necessary dependencies, including snakemake, ```sh cd /path/to/quac/repo +# For use only at Cheaha in UAB. Load conda into environment. module reset module load Anaconda3/2020.02 # create conda environment. Needed only the first time. conda env create --file configs/env/quac.yaml + # activate conda environment conda activate quac @@ -143,17 +116,17 @@ In order to run the QuaC pipeline, user needs to 1. Install the pipeline and set up the conda environment ([see above](#installation)) 2. Set up config files specifying paths required by QC tools used in the pipeline. -3. Run QuaC pipeline just to create singularity+conda environments using testing dataset (optional) +3. 
Run QuaC pipeline just to create singularity+conda environments using the testing dataset (optional) ### Requirements ***Direct*** -- Snakemake +- Snakemake-minimal - Tested with v6.0.5 - Gets installed as part of conda environment - Python - - Tested with v3.6.3 + - Tested with v3.6.13 - Gets installed as part of conda environment - slurmpy - Tested with v0.0.8 @@ -163,10 +136,10 @@ In order to run the QuaC pipeline, user needs to - Anaconda/miniconda - Tested with Anaconda3/2020.02 - - Available as module from cheaha + - Available as module from cheaha at UAB - Singularity - Tested with v3.5.2 - - Will be loaded as a module when running QuaC + - Available as module from cheaha at UAB Tools below are used in the QuaC pipeline, and snakemake automatically installs them in conda environments inside the singularity container. Therefore, they don't need to be manually installed. For tool versions used, refer to the @@ -185,25 +158,13 @@ snakemake rules. ### Set up workflow config file -QuaC requires a workflow config file in yaml format ([`configs/workflow.yaml`](./configs/workflow.yaml)), which provides filepaths to necessary -dataset dependencies required by certain QC tools. In addition, hardware resources can be configured (refer to [`configs/workflow.yaml`](./configs/workflow.y) for more info). File format should look like: - -```yaml -datasets: - ref: "path to ref genome path" - somalier: - sites: "path to somalier's site file" - labels_1kg: "path to somalier's ancestry-labels-1kg file" - somalier_1kg: "dirpath to somalier's 1kg-somalier files" - verifyBamID: - svd_dat_wgs: "path to WGS resources .dat files" - svd_dat_exome: "path to exome resources .dat files" - -#### hardware resources #### -resources: - ... - ... 
-``` +QuaC requires a workflow config file in yaml format (default is [`configs/workflow.yaml`](./configs/workflow.yaml)), +which provides: + +- Filepaths to necessary dataset dependencies required by certain QC tools +- Hardware resource configs + +Custom workflow config file can be provided to QuaC via `--workflow_config`. #### Prepare verifybamid datasets for exome analysis @@ -227,10 +188,11 @@ After activating the conda environment, QuaC pipeline can be run using the wrapp all the options available: ```sh -$ ./src/run_quac.py -h +$ python src/run_quac.py -h usage: run_quac.py [-h] [--project_name] [--projects_path] [--pedigree] [--quac_watch_config] [--workflow_config] [--outdir] - [--exome] [--cluster_config] [--log_dir] [-e] [-n] [-l] + [--exome] [--include_prior_qc] [--allow_sample_renaming] + [--cluster_config] [--log_dir] [-e] [-n] [-l] [--rerun_failed] [--slurm_partition] Wrapper tool for QuaC pipeline. @@ -252,9 +214,14 @@ QuaC workflow options: of tools used in QuaC (default: configs/workflow.yaml) --outdir Out directory path (default: $USER_SCRATCH/tmp/quac/results/test_project/analysis) - --exome Flag to run in exome mode. WARNING: Please provide - appropriate configs via --quac_watch_config. (default: - False) + --exome Flag to run the workflow in exome mode. WARNING: + Please provide appropriate configs via + --quac_watch_config. (default: False) + --include_prior_qc Flag to additionally use prior QC data as input. See + documentation for more info. (default: False) + --allow_sample_renaming + Flag to allow sample renaming in MultiQC report. See + documentation for more info. (default: False) QuaC wrapper options: --cluster_config Cluster config json file. 
Needed for snakemake to run @@ -279,11 +246,12 @@ QuaC wrapper options: medium(max 50 hrs), long(max 150 hrs) (default: short) ``` -To run the wrapper script, which in turn will execute the QuaC pipeline: +Minimal example to run the wrapper script, which in turn will execute the QuaC pipeline: ```sh module reset module load Anaconda3/2020.02 +module load Singularity/3.5.2-GCC-5.4.0-2.26 conda activate quac python src/run_quac.py \ @@ -307,7 +275,7 @@ Besides the basic features, wrapper script [`src/run_quac.py`](./src/run_quac.py ### Create singularity+conda environments for tools used in QuaC pipeline All the jobs initiated by QuaC would be run inside a conda environment, which themselves were created inside a -singularity container. It may be a good idea to create these environments even before they are run with actual samples. +singularity container. It may be a good idea to create these environments before they are run with actual samples. While this step is optional, this will ensure that there will not be any conflicts when running multiple instances of the pipeline. @@ -318,13 +286,14 @@ provided. ```sh module reset module load Anaconda3/2020.02 +module load Singularity/3.5.2-GCC-5.4.0-2.26 conda activate quac # WGS mode python src/run_quac.py \ --project_name test_project \ --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/project_2_samples.ped" \ + --pedigree ".test/configs/no_priorQC/project_2samples.ped" \ --outdir "$USER_SCRATCH/tmp/quac/results/test_project_wgs/analysis" \ -e="--conda-create-envs-only" ``` @@ -333,48 +302,58 @@ python src/run_quac.py \ ### Input requirements - Pedigree file supplied via `--pedigree`. Only the samples that are supplied in pedigree file will be processed by QuaC - and all of these samples must belong to the same project. 
This repo also includes a handy script - [`src/create_dummy_ped.py`](src/create_dummy_ped.py) that can create a dummy pedigree file, which will lack sex - (unless project tracking sheet is provided), relatedness and affected status info. See header of the script for usage - instructions. Note that we plan to use [phenotips](https://phenotips.com/) in future to produce fully capable pedigree - file. One could manually create them as well, but this would be error-prone. + and all of these samples must belong to the same project. + - *For CGDS use only*: This repo includes a handy script [`src/create_dummy_ped.py`](src/create_dummy_ped.py) that can + create a dummy pedigree file, which will lack sex (unless project tracking sheet is provided), relatedness and + affected status info. See header of the script for usage instructions. Note that we plan to use + [phenotips](https://phenotips.com/) in future to produce fully capable pedigree file. One could manually create them + as well, but this could be error-prone. - Output produced by [the small variant caller pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/sciops/pipelines/small_variant_caller_pipeline). This includes bam, vcf and QC output. Refer to [test sample dataset](.test/ngs-data/test_project/analysis/A), which is representative of the input required. -- [QuaC config file](#set-up-workflow-config-file) + +- QuaC workflow config file. Refer to [section here](#set-up-workflow-config-file) for more info. + - When run in exome mode, QuaC requires a capture-regions bed file at path - `path_to_sample/configs/small_variant_caller/<capture_regions>.bed`. + `path_to_sample/configs/small_variant_caller/<capture_regions>.bed` for each sample. 
### Example usage ```sh -# to quack on a WGS project +# to quack on a WGS project, which also has prior QC data +PROJECT="Quack_Quack" python src/run_quac.py \ - --project_name CF_CFF_PFarrell \ - --pedigree "data/raw/ped/CF_CFF_PFarrell.ped" + --project_name $PROJECT \ + --pedigree "data/raw/ped/${PROJECT}.ped" \ + --include_prior_qc \ + --allow_sample_renaming + -# to quack on a WGS project and write results to a dir of choice -PROJECT="CF_CFF_PFarrell" +# to quack on a WGS project, run in a medium slurm partition and write results to a dir of choice +PROJECT="Quack_This" python src/run_quac.py \ --slurm_partition medium \ - --project_name ${PROJECT} \ + --project_name $PROJECT \ --pedigree "data/raw/ped/${PROJECT}.ped" \ - --outdir "/data/scratch/manag/tmp/quac/results/test_${PROJECT}/analysis" + --outdir "$USER_SCRATCH/tmp/quac/results/test_${PROJECT}/analysis" + # to quack on an exome project +PROJECT="Quack_That" python src/run_quac.py \ - --project_name HCC \ - --pedigree "data/raw/ped/HCC.ped" \ + --project_name $PROJECT \ + --pedigree "data/raw/ped/${PROJECT}.ped" \ --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ --exome -# to quack on an exome project which is not in the default CGDS projects_path +# to quack on an exome project by providing path to that project +PROJECT="Quack_That" python src/run_quac.py \ - --project_name UnusualCancers_CMGalluzi \ - --projects_path "/data/project/sloss/cgds_path_cmgalluzzi/" \ - --pedigree "data/raw/ped/UnusualCancers_CMGalluzi.ped" \ + --project_name $PROJECT \ + --projects_path "/path/to/project/${PROJECT}/" \ + --pedigree "data/raw/ped/${PROJECT}.ped" \ + --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ + --exome ``` @@ -383,21 +362,22 @@ python src/run_quac.py \ QuaC results are stored at the path specified via option `--outdir` (default: `$USER_SCRATCH/tmp/quac/results/test_project/analysis`).
Refer to the [testing's output](#expected-output-files) to -learn about output directory structure. Most important output files are aggregated QC results produced by -[multiqc](https://multiqc.info/), both at sample-level as well as at the project-level. These multiqc reports also -include summary of QuaC-Watch results. +learn more about the output directory structure. Users may primarily be interested in the aggregated QC results +produced by [multiqc](https://multiqc.info/), both at sample-level as well as at the project-level. These multiqc +reports also include summary of QuaC-Watch results. -Note that QuaC's output directory structure takes the output structure of the small variant caller pipeline. +Note that QuaC's output directory structure has been designed based on the output structure of the [small variant caller +pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/sciops/pipelines/small_variant_caller_pipeline). ## Testing pipeline The system-level testing implemented for this pipeline tests whether the pipeline runs from start to finish without any error. This testing uses test datasets present in [`.test/ngs-data/test_project`](.test/ngs-data/test_project), which -reflects a test project containing two samples. [See here](.test/README.md) for more info on how these test datasets -were created. +reflects a test project containing four samples (2 with input needed when `include_priorQC` is used and 2 other samples +without priorQC data). [See here](.test/README.md) for more info on how these test datasets were created. -> **_NOTE:_** This testing does not verify that pipeline's output are correct. Instead, its purpose is just to ensure -> that pipeline runs from beginning to end without any execution error for the given test dataset. +> **_NOTE:_** This testing does not verify that pipeline's output are correct.
Instead, its purpose is to ensure that +> pipeline runs from beginning to end without any execution error for the given test dataset. ### How to run @@ -405,27 +385,56 @@ were created. ```sh module reset module load Anaconda3/2020.02 +module load Singularity/3.5.2-GCC-5.4.0-2.26 conda activate quac +########## No prior QC data involved ########## +PROJECT_CONFIG="project_2samples" +PRIOR_QC_STATUS="no_priorQC" + # WGS mode -PROJECT="project_2_samples" python src/run_quac.py \ --project_name test_project \ --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/${PROJECT}.ped" \ - --outdir "$USER_SCRATCH/tmp/quac/results/test_${PROJECT}_wgs/analysis" + --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --outdir "$USER_SCRATCH/tmp/quac/results/test_${PROJECT_CONFIG}_wgs-${PRIOR_QC_STATUS}/analysis" # Exome mode python src/run_quac.py \ --project_name test_project \ --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/${PROJECT}.ped" \ - --outdir "$USER_SCRATCH/tmp/quac/results/test_${PROJECT}_exome/analysis" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --outdir "$USER_SCRATCH/tmp/quac/results/test_${PROJECT_CONFIG}_exome-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ --exome + + +########## Includes prior QC data and allows sample renaming ########## +PROJECT_CONFIG="project_2samples" +PRIOR_QC_STATUS="include_priorQC" + +# WGS mode +python src/run_quac.py \ + --project_name test_project \ + --projects_path ".test/ngs-data/" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --outdir "$USER_SCRATCH/tmp/quac/results/test_${PROJECT_CONFIG}_wgs-${PRIOR_QC_STATUS}/analysis" \ + --include_prior_qc \ + --allow_sample_renaming + +# Exome mode +python src/run_quac.py \ + --project_name test_project \ + --projects_path ".test/ngs-data/" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --outdir 
"$USER_SCRATCH/tmp/quac/results/test_${PROJECT_CONFIG}_exome-${PRIOR_QC_STATUS}/analysis" \ + --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ + --include_prior_qc \ + --allow_sample_renaming \ + --exome ``` -Note: Use `PROJECT="project_1_sample"` to test out a project with only one sample. +Note: Use `PROJECT_CONFIG="project_1sample"` to test out a project with only one sample. ### Expected output files @@ -479,7 +488,7 @@ $ tree $USER_SCRATCH/tmp/quac/results/test_project_2_samples/ -d -L 4 └── ... ``` -Certain tools (eg. indexcov and covviz) are not executed when QuaC is run in exome mode (`--exome`). +Note: Certain tools (eg. indexcov and covviz) are not executed when QuaC is run in exome mode (`--exome`). ## Visualization of workflow @@ -495,7 +504,9 @@ srun --ntasks=1 --cpus-per-task=1 --mem-per-cpu=4096 --partition=express --pty / # setup environment module reset module load Anaconda3/2020.02 +module load Singularity/3.5.2-GCC-5.4.0-2.26 conda activate quac + DAG_DIR="pipeline_visualized" ###### WGS mode ###### @@ -540,3 +551,10 @@ If you like to make changes to the source code, please see the [contribution gui ## Changelog See [here](./Changelog.md).
+ +## Repo owner + +* *Mana*valan Gajapathy + + + diff --git a/configs/env/bcftools.yaml b/configs/env/bcftools.yaml index 655c10730679f2b1efc6490ee513e3b6078c3578..a53bbb66b515c60fb8877748e26e97587c0ed1f4 100644 --- a/configs/env/bcftools.yaml +++ b/configs/env/bcftools.yaml @@ -3,3 +3,31 @@ channels: - bioconda dependencies: - bcftools =1.12 + # other dependencies + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - bzip2=1.0.8 + - c-ares=1.17.2 + - ca-certificates=2021.5.30 + - gsl=2.6 + - htslib=1.12 + - krb5=1.19.2 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.78.0 + - libdeflate=1.7 + - libedit=3.1.20191231 + - libev=4.33 + - libgcc-ng=11.1.0 + - libgfortran-ng=11.1.0 + - libgfortran5=11.1.0 + - libgomp=11.1.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.17 + - libssh2=1.9.0 + - libstdcxx-ng=11.1.0 + - ncurses=6.2 + - openssl=1.1.1k + - perl=5.32.1 + - tk=8.6.10 + - xz=5.2.5 diff --git a/configs/env/covviz.yaml b/configs/env/covviz.yaml deleted file mode 100644 index 5f8e6404d3ca7b22246999079481c4d085fba1d9..0000000000000000000000000000000000000000 --- a/configs/env/covviz.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: covviz - -channels: - - conda-forge - -dependencies: - - python==3.8.5 - - pip - - pip: - - covviz==1.2.2 diff --git a/configs/env/goleft.yaml b/configs/env/goleft.yaml index c1010183ea987985b625dbc7ec3060e54ccb2dd1..56aa771246f5a7186189551f51995ce7c48eb8b9 100644 --- a/configs/env/goleft.yaml +++ b/configs/env/goleft.yaml @@ -3,3 +3,25 @@ channels: - bioconda dependencies: - goleft ==0.2.4 + # other dependencies + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - bzip2=1.0.8 + - c-ares=1.17.2 + - ca-certificates=2021.5.30 + - htslib=1.13 + - krb5=1.19.2 + - libcurl=7.78.0 + - libdeflate=1.7 + - libedit=3.1.20191231 + - libev=4.33 + - libgcc-ng=11.1.0 + - libgomp=11.1.0 + - libnghttp2=1.43.0 + - libssh2=1.9.0 + - libstdcxx-ng=11.1.0 + - ncurses=6.2 + - openssl=1.1.1k + - samtools=1.13 + - tk=8.6.10 + - xz=5.2.5 diff --git a/configs/env/mosdepth.yaml 
b/configs/env/mosdepth.yaml index 683254fed2d0c711765016b8a7c724f99fdd9363..552d37856619f38a350da8d9aeba0aa7c1ca18a8 100644 --- a/configs/env/mosdepth.yaml +++ b/configs/env/mosdepth.yaml @@ -6,3 +6,25 @@ channels: dependencies: - mosdepth==0.3.1 + # other dependencies + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - bzip2=1.0.8 + - c-ares=1.17.2 + - ca-certificates=2021.5.30 + - htslib=1.12 + - krb5=1.19.2 + - libcurl=7.78.0 + - libdeflate=1.7 + - libedit=3.1.20191231 + - libev=4.33 + - libgcc-ng=11.1.0 + - libgomp=11.1.0 + - libnghttp2=1.43.0 + - libssh2=1.9.0 + - libstdcxx-ng=11.1.0 + - ncurses=6.2 + - openssl=1.1.1k + - pcre=8.45 + - tk=8.6.10 + - xz=5.2.5 diff --git a/configs/env/multiqc.yaml b/configs/env/multiqc.yaml index 92cfadff8c62f67ddc654adf8eb8d67a89c7c041..a0b43ffb31bb6668dfcef59c7f5bfe1bd2e7afb0 100644 --- a/configs/env/multiqc.yaml +++ b/configs/env/multiqc.yaml @@ -5,6 +5,7 @@ channels: dependencies: - python=3.6.13 - multiqc==1.9 + # other dependencies - networkx=2.5 - numpy=1.19.5 - _libgcc_mutex=0.1 diff --git a/configs/env/picard.yaml b/configs/env/picard.yaml index 6a3a4a6560fa7a38f683dcefe304c055fa473f6e..26f653a5ebd1eab35dc29a116d33f4af928866f6 100644 --- a/configs/env/picard.yaml +++ b/configs/env/picard.yaml @@ -8,3 +8,105 @@ dependencies: - picard==2.23.0 - openjdk==11.0.9.1 - r-base==4.0.3 + # other dependencies + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - alsa-lib=1.2.3 + - binutils_impl_linux-64=2.36.1 + - binutils_linux-64=2.36 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.2 + - ca-certificates=2021.5.30 + - cairo=1.16.0 + - curl=7.76.1 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.4.0 + - gcc_linux-64=9.4.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.4.0 + - gfortran_linux-64=9.4.0 + - 
giflib=5.2.1 + - graphite2=1.3.13 + - gsl=2.6 + - gxx_impl_linux-64=9.4.0 + - gxx_linux-64=9.4.0 + - harfbuzz=2.8.2 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.17.2 + - lcms2=2.12 + - ld_impl_linux-64=2.36.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.76.1 + - libdeflate=1.7 + - libedit=3.1.20191231 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.4.0 + - libgcc-ng=11.1.0 + - libgfortran-ng=11.1.0 + - libgfortran5=11.1.0 + - libglib=2.68.3 + - libgomp=11.1.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.17 + - libpng=1.6.37 + - libsanitizer=9.4.0 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.4.0 + - libstdcxx-ng=11.1.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.13 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1k + - pango=1.48.7 + - pcre=8.45 + - pcre2=10.36 + - pixman=0.40.0 + - pthread-stubs=0.4 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 diff --git a/configs/env/picard_smk.yaml b/configs/env/picard_smk.yaml new file mode 100644 index 0000000000000000000000000000000000000000..80918af1816e1c581946038b3924a2b7d40d38e4 --- /dev/null +++ b/configs/env/picard_smk.yaml @@ -0,0 +1,111 @@ +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - picard=2.23.0 + - snakemake-wrapper-utils=0.1.3 + # other dependencies + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - alsa-lib=1.2.3 + 
- binutils_impl_linux-64=2.36.1 + - binutils_linux-64=2.36 + - bwidget=1.9.14 + - bzip2=1.0.8 + - ca-certificates=2021.5.30 + - cairo=1.16.0 + - certifi=2021.5.30 + - curl=7.71.1 + - fontconfig=2.13.1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.4.0 + - gcc_linux-64=9.4.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.4.0 + - gfortran_linux-64=9.4.0 + - giflib=5.2.1 + - graphite2=1.3.13 + - gsl=2.6 + - gxx_impl_linux-64=9.4.0 + - gxx_linux-64=9.4.0 + - harfbuzz=2.8.2 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.17.2 + - lcms2=2.12 + - ld_impl_linux-64=2.36.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.71.1 + - libdeflate=1.7 + - libedit=3.1.20191231 + - libffi=3.3 + - libgcc-devel_linux-64=9.4.0 + - libgcc-ng=11.1.0 + - libgfortran-ng=11.1.0 + - libgfortran5=11.1.0 + - libglib=2.68.3 + - libgomp=11.1.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libopenblas=0.3.17 + - libpng=1.6.37 + - libsanitizer=9.4.0 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.4.0 + - libstdcxx-ng=11.1.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.13 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openjdk=11.0.9.1 + - openssl=1.1.1k + - pango=1.42.4 + - pcre=8.45 + - pcre2=10.36 + - pip=21.2.3 + - pixman=0.40.0 + - pthread-stubs=0.4 + - python=3.9.6 + - python_abi=3.9 + - r-base=4.0.3 + - readline=8.1 + - sed=4.8 + - setuptools=49.6.0 + - sqlite=3.36.0 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - tzdata=2021a + - wheel=0.37.0 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 
+ - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 diff --git a/configs/env/quac.yaml b/configs/env/quac.yaml index a5c0c9b73104f5d6d82b8248c129ce8f8106d20a..242b9ffe8df8ea5d79e9ee64a6a7d0f606ca4d67 100644 --- a/configs/env/quac.yaml +++ b/configs/env/quac.yaml @@ -9,7 +9,7 @@ dependencies: - black==20.8b1 - pylint==2.7.2 - bioconda::snakefmt==0.4.0 - - bioconda::snakemake==6.0.5 - - pip + - bioconda::snakemake-minimal==6.0.5 + - pip==21.0.1 - pip: - slurmpy==0.0.8 diff --git a/configs/env/quac_watch.yaml b/configs/env/quac_watch.yaml index 8d8ad5ebe0889d2f773e264c3500e61be1bd8898..1cc11eaf3f7c6e3d037cef0dbcb0ca6b71044404 100644 --- a/configs/env/quac_watch.yaml +++ b/configs/env/quac_watch.yaml @@ -3,9 +3,86 @@ channels: - anaconda - bioconda dependencies: - - python =3.6 - - pandas =1.1 + - python=3.6.13 + - pandas=1.1.5 + - numpy=1.19.5 - jinja2==2.11.3 - pyyaml=5.3 - fire=0.3 - multiqc=1.9 + # other dependencies + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - brotlipy=0.7.0 + - ca-certificates=2021.5.30 + - certifi=2021.5.30 + - cffi=1.14.6 + - chardet=4.0.0 + - charset-normalizer=2.0.0 + - click=8.0.1 + - coloredlogs=15.0.1 + - colormath=3.0.0 + - cryptography=3.4.7 + - cycler=0.10.0 + - decorator=5.0.9 + - freetype=2.10.4 + - future=0.18.2 + - humanfriendly=9.2 + - idna=3.1 + - importlib-metadata=4.6.3 + - jbig=2.1 + - jpeg=9d + - kiwisolver=1.3.1 + - lcms2=2.12 + - ld_impl_linux-64=2.36.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libdeflate=1.7 + - libffi=3.3 + - libgcc-ng=11.1.0 + - libgfortran-ng=11.1.0 + - libgfortran5=11.1.0 + - libgomp=11.1.0 + - liblapack=3.9.0 + - libopenblas=0.3.17 + - libpng=1.6.37 + - libstdcxx-ng=11.1.0 + - libtiff=4.3.0 + - libwebp-base=1.2.0 + - lz4-c=1.9.3 + - lzstring=1.0.4 + - markdown=3.3.4 + - markupsafe=2.0.1 + - matplotlib-base=3.3.4 + - ncurses=6.2 + - networkx=2.5 + - olefile=0.46 + - openjpeg=2.4.0 + - openssl=1.1.1k + - pillow=8.3.1 + - pip=21.2.3 + - pycparser=2.20 + - pyopenssl=20.0.1 + - pyparsing=2.4.7 + - 
pysocks=1.7.1 + - python-dateutil=2.8.2 + - python_abi=3.6 + - pytz=2021.1 + - readline=8.1 + - requests=2.26.0 + - setuptools=49.6.0 + - simplejson=3.17.3 + - six=1.16.0 + - spectra=0.0.11 + - sqlite=3.36.0 + - termcolor=1.1.0 + - tk=8.6.10 + - tornado=6.1 + - typing_extensions=3.10.0.0 + - urllib3=1.26.6 + - wheel=0.37.0 + - xz=5.2.5 + - yaml=0.2.5 + - zipp=3.5.0 + - zlib=1.2.11 diff --git a/configs/env/qualimap.yaml b/configs/env/qualimap.yaml index 7d156ab4c653a2953454bd924a5cf1a4257207fe..42a4c26837ee09e118bba01b97f846531047b06a 100644 --- a/configs/env/qualimap.yaml +++ b/configs/env/qualimap.yaml @@ -1,6 +1,147 @@ - channels: - conda-forge - bioconda dependencies: - - qualimap =2.2 + - qualimap ==2.2.2d + # other dependencies + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - alsa-lib=1.2.3 + - binutils_impl_linux-64=2.36.1 + - binutils_linux-64=2.36 + - bioconductor-biobase=2.52.0 + - bioconductor-biocgenerics=0.38.0 + - bioconductor-biocio=1.2.0 + - bioconductor-biocparallel=1.26.0 + - bioconductor-biostrings=2.60.0 + - bioconductor-delayedarray=0.18.0 + - bioconductor-genomeinfodb=1.28.0 + - bioconductor-genomeinfodbdata=1.2.6 + - bioconductor-genomicalignments=1.28.0 + - bioconductor-genomicranges=1.44.0 + - bioconductor-iranges=2.26.0 + - bioconductor-matrixgenerics=1.4.0 + - bioconductor-noiseq=2.36.0 + - bioconductor-rhtslib=1.24.0 + - bioconductor-rsamtools=2.8.0 + - bioconductor-rtracklayer=1.52.0 + - bioconductor-s4vectors=0.30.0 + - bioconductor-summarizedexperiment=1.22.0 + - bioconductor-xvector=0.32.0 + - bioconductor-zlibbioc=1.38.0 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.2 + - ca-certificates=2021.5.30 + - cairo=1.16.0 + - curl=7.78.0 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.4.0 + - 
gcc_linux-64=9.4.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.4.0 + - gfortran_linux-64=9.4.0 + - giflib=5.2.1 + - graphite2=1.3.13 + - gsl=2.6 + - gxx_impl_linux-64=9.4.0 + - gxx_linux-64=9.4.0 + - harfbuzz=2.8.2 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.19.2 + - lcms2=2.12 + - ld_impl_linux-64=2.36.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.78.0 + - libdeflate=1.7 + - libedit=3.1.20191231 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.4.0 + - libgcc-ng=11.1.0 + - libgfortran-ng=11.1.0 + - libgfortran5=11.1.0 + - libglib=2.68.3 + - libgomp=11.1.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.17 + - libpng=1.6.37 + - libsanitizer=9.4.0 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.4.0 + - libstdcxx-ng=11.1.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.13 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openjdk=11.0.9.1 + - openssl=1.1.1k + - pango=1.48.7 + - pcre=8.45 + - pcre2=10.37 + - pixman=0.40.0 + - pthread-stubs=0.4 + - r-base=4.1.1 + - r-bh=1.75.0_0 + - r-bitops=1.0_7 + - r-crayon=1.4.1 + - r-formatr=1.11 + - r-futile.logger=1.4.3 + - r-futile.options=1.0.1 + - r-getopt=1.20.3 + - r-lambda.r=1.2.4 + - r-lattice=0.20_44 + - r-matrix=1.3_4 + - r-matrixstats=0.60.0 + - r-optparse=1.6.6 + - r-rcurl=1.98_1.3 + - r-restfulr=0.0.13 + - r-rjson=0.2.20 + - r-snow=0.4_3 + - r-xml=3.99_0.6 + - r-yaml=2.2.1 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - 
xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 diff --git a/configs/env/samtools.yaml b/configs/env/samtools.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0024e6ba0a5247d4d3f89d804429193edc0f75a7 --- /dev/null +++ b/configs/env/samtools.yaml @@ -0,0 +1,25 @@ +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - samtools=1.10 + # other dependencies + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - bzip2=1.0.8 + - ca-certificates=2021.5.30 + - htslib=1.10.2 + - krb5=1.17.2 + - libcurl=7.71.1 + - libdeflate=1.6 + - libedit=3.1.20191231 + - libgcc-ng=11.1.0 + - libgomp=11.1.0 + - libssh2=1.9.0 + - libstdcxx-ng=11.1.0 + - ncurses=6.2 + - openssl=1.1.1k + - tk=8.6.10 + - xz=5.2.5 + - zlib=1.2.11 diff --git a/configs/env/verifyBamID.yaml b/configs/env/verifyBamID.yaml index f8b1e257c4e6f6077f380e7a1b120ddc6b9b839f..303558f6d514f51c32b372ac8aba73753ab615e5 100644 --- a/configs/env/verifyBamID.yaml +++ b/configs/env/verifyBamID.yaml @@ -7,3 +7,26 @@ channels: dependencies: - verifybamid2==2.0.1 - htslib==1.10.2 + # other dependencies + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - bzip2=1.0.8 + - c-ares=1.17.2 + - ca-certificates=2021.5.30 + - curl=7.78.0 + - htslib=1.10.2 + - krb5=1.19.2 + - libcurl=7.78.0 + - libdeflate=1.6 + - libedit=3.1.20191231 + - libev=4.33 + - libgcc-ng=11.1.0 + - libgomp=11.1.0 + - libnghttp2=1.43.0 + - libssh2=1.9.0 + - libstdcxx-ng=11.1.0 + - ncurses=6.2 + - openssl=1.1.1k + - tk=8.6.10 + - verifybamid2=2.0.1 + - xz=5.2.5 diff --git a/configs/multiqc_config_template.jinja2 b/configs/multiqc_config_template.jinja2 index 3b2bb98129410bd3ef9ebd0c5852a586655df91b..9993aff707b45bd77cffcb467f3c58e7c880ae7b 100644 --- a/configs/multiqc_config_template.jinja2 +++ b/configs/multiqc_config_template.jinja2 @@ -70,14 +70,12 @@ qualimap_config: - 100 - 200 -# configs for QuaC-Watch -# NOTE: This multiqc config file has hard-coded parameters, which is based on the QuaC-Watch config file. 
-# If parameters in QuaC-Watch config file get modified, this file may need to be edited correspondingly. +# configs for QuaC-Watch (based on the QuaC-Watch config file). custom_data: quac_watch_overall_summary: parent_id: "quac_watch" parent_name: "QuaC-Watch" - parent_description: "This section contains QuaC-Watch results. QuaC-Watch summarizes if samples have passed the QC thresholds (Might not work well for WES)." + parent_description: "This section contains QuaC-Watch results. QuaC-Watch summarizes if samples have passed the QC thresholds." id: "quac_watch_overall_summary" section_name: "Overall QuaC-Watch Summary" description: "Overall QuaC-Watch summary of results from several QC tools" diff --git a/configs/snakemake_slurm_profile b/configs/snakemake_slurm_profile deleted file mode 160000 index 4ecaf55d398ebfdf8415dff50c26beea0237c34d..0000000000000000000000000000000000000000 --- a/configs/snakemake_slurm_profile +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4ecaf55d398ebfdf8415dff50c26beea0237c34d diff --git a/configs/workflow.yaml b/configs/workflow.yaml index 7fb7e5eeb3c24fe0ecf9e5907bb428b055e718d4..7d99f8e86d7f910899fcf5ec328a3c55b9fd9784 100644 --- a/configs/workflow.yaml +++ b/configs/workflow.yaml @@ -1,13 +1,16 @@ datasets: + # path to ref genome ref: "/data/project/worthey_lab/datasets_central/human_reference_genome/processed/GRCh38/no_alt_rel20190408/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" + # path to Somalier dataset dependencies somalier: sites: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/sites/sites.hg38.vcf.gz" labels_1kg: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/ancestry-labels-1kg.tsv" somalier_1kg: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/1kg-somalier/" + # path to verifyBamID dataset dependencies verifyBamID: svd_dat_wgs: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/wgs/1000g.phase3.100k.b38.vcf.gz.dat" 
svd_dat_exome: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/exome/chr_added/1000g.phase3.10k.b38.exome.vcf.gz.dat" - + #### hardware resources #### resources: qualimap_bamqc: diff --git a/src/quac_watch/quac_watch.py b/src/quac_watch/quac_watch.py index 28fd90a0dbfd98672309ac16ba2dfcfc16c2abe4..8fbeafca774046c592967b9800a6258de786a9cb 100644 --- a/src/quac_watch/quac_watch.py +++ b/src/quac_watch/quac_watch.py @@ -43,8 +43,11 @@ def main( # fastqc LOGGER.info("-" * 80) - fastqc_outfile = f"{out_filepath_prefix}_fastqc.yaml" - qc_checks_dict["fastqc"] = fastqc.fastqc(fastqc_f, config_dict["fastqc"], fastqc_outfile) + if fastqc_f is not None: + fastqc_outfile = f"{out_filepath_prefix}_fastqc.yaml" + qc_checks_dict["fastqc"] = fastqc.fastqc(fastqc_f, config_dict["fastqc"], fastqc_outfile) + else: + LOGGER.info("Skipping fastqc component as it was not supplied by the user") # read multiqc's general stats LOGGER.info("-" * 80) @@ -78,10 +81,13 @@ def main( # picard duplication LOGGER.info("-" * 80) - picard_dup_outfile = f"{out_filepath_prefix}_picard_dups.yaml" - qc_checks_dict["picard_dups"] = picard.duplication( - picard_dup_f, config_dict["picard"]["MarkDuplicates"], picard_dup_outfile - ) + if picard_dup_f is not None: + picard_dup_outfile = f"{out_filepath_prefix}_picard_dups.yaml" + qc_checks_dict["picard_dups"] = picard.duplication( + picard_dup_f, config_dict["picard"]["MarkDuplicates"], picard_dup_outfile + ) + else: + LOGGER.info("Skipping Picard-duplication component as it was not supplied by the user") # bcftools-stats LOGGER.info("-" * 80) @@ -120,10 +126,13 @@ def main( # fastq screen LOGGER.info("-" * 80) - fastq_screen_outfile = f"{out_filepath_prefix}_fastq_screen.yaml" - qc_checks_dict["fastq_screen"] = fastq_screen.fastq_screen( - fastq_screen_f, config_dict["fastq_screen"], fastq_screen_outfile - ) + if fastq_screen_f is not None: + fastq_screen_outfile = f"{out_filepath_prefix}_fastq_screen.yaml" + 
qc_checks_dict["fastq_screen"] = fastq_screen.fastq_screen( + fastq_screen_f, config_dict["fastq_screen"], fastq_screen_outfile + ) + else: + LOGGER.info("Skipping fastq_screen component as it was not supplied by the user") # write QC check results to file LOGGER.info("-" * 80) diff --git a/src/run_quac.py b/src/run_quac.py index f76c4ef3898c23bc83e5b9fb770cb1053a9d8c6c..9fe02f002277fd95c0bdcf1984f808f39ecd786f 100755 --- a/src/run_quac.py +++ b/src/run_quac.py @@ -21,7 +21,7 @@ from pathlib import Path import uuid import os.path import yaml -from utility_cgds.cgds.pipeline.src.submit_slurm_job import submit_slurm_job +from slurm.submit_slurm_job import submit_slurm_job def make_dir(d): @@ -97,17 +97,15 @@ def create_snakemake_command(args, repo_path, mount_paths): """ # slurm profile dir for snakemake to properly handle to cluster job fails - snakemake_profile_dir = ( - repo_path / "configs/snakemake_slurm_profile//{{cookiecutter.profile_name}}/" - ) + snakemake_profile_dir = repo_path / "src/slurm/slurm_profile" # use absolute path to run it from anywhere snakefile_path = repo_path / "workflow" / "Snakefile" # directory to use as tmp in singularity container # If not exist, singularity will complain - tmp_dir = os.path.expandvars("$USER_SCRATCH/tmp/quac/tmp") - make_dir(tmp_dir) + tmp_dir = args.tmp_dir + # make_dir(tmp_dir) quac_configs = { "project_name": args.project_name, @@ -119,6 +117,8 @@ def create_snakemake_command(args, repo_path, mount_paths): "out_dir": args.outdir, "log_dir": args.log_dir, "exome": args.exome, + "include_prior_qc_data": args.include_prior_qc, + "allow_sample_renaming": args.allow_sample_renaming, } quac_configs = " ".join([f"{k}='{v}'" for k, v in quac_configs.items()]) @@ -168,13 +168,7 @@ def main(args): snakemake_cmd = create_snakemake_command(args, repo_path, mount_paths) # put together pipeline command to be run - singularity_module = "Singularity/3.5.2-GCC-5.4.0-2.26" - pipeline_cmd = "\n".join( - [ - f"module load 
{singularity_module}", - " \\\n\t".join(snakemake_cmd), - ] - ) + pipeline_cmd = " \\\n\t".join(snakemake_cmd) print( f'{"#" * 40}\n' @@ -282,10 +276,28 @@ if __name__ == "__main__": type=lambda x: is_valid_dir(PARSER, x), metavar="", ) + TMPDIR_DEFAULT = "$USER_SCRATCH/tmp/quac/tmp" + WORKFLOW.add_argument( + "--tmp_dir", + help="Directory path to store temporary files created by the workflow", + default=TMPDIR_DEFAULT, + type=lambda x: is_valid_dir(PARSER, x), + metavar="", + ) WORKFLOW.add_argument( "--exome", action="store_true", - help="Flag to run in exome mode. WARNING: Please provide appropriate configs via --quac_watch_config.", + help="Flag to run the workflow in exome mode. WARNING: Please provide appropriate configs via --quac_watch_config.", + ) + WORKFLOW.add_argument( + "--include_prior_qc", + action="store_true", + help="Flag to additionally use prior QC data as input. See documentation for more info.", + ) + WORKFLOW.add_argument( + "--allow_sample_renaming", + action="store_true", + help="Flag to allow sample renaming in MultiQC report. 
See documentation for more info.", ) ############ Args for QuaC wrapper tool ############ diff --git a/src/slurm/slurm_profile/config.yaml b/src/slurm/slurm_profile/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5272e3e0fe6fe97be84f65b1262d81254b030d8d --- /dev/null +++ b/src/slurm/slurm_profile/config.yaml @@ -0,0 +1,6 @@ +cluster-status: "slurm-status.py" +keep-going: True +printshellcmds: True +jobs: 30 +max-jobs-per-second: 7 +max-status-checks-per-second: 7 diff --git a/src/slurm/slurm_profile/slurm-status.py b/src/slurm/slurm_profile/slurm-status.py new file mode 100755 index 0000000000000000000000000000000000000000..306a0d35c2ac4052bf692a45b4b5f479818e6f10 --- /dev/null +++ b/src/slurm/slurm_profile/slurm-status.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +""" +Modified from https://github.com/Snakemake-Profiles/slurm/blob/master/%7B%7Bcookiecutter.profile_name%7D%7D/slurm-status.py +""" + +import re +import subprocess as sp +import shlex +import sys +import time +import logging + +logger = logging.getLogger("__name__") + +STATUS_ATTEMPTS = 20 + +jobid = sys.argv[1] + +for i in range(STATUS_ATTEMPTS): + try: + sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid))) + res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} + break + except sp.CalledProcessError as e: + logger.error("sacct process error") + logger.error(e) + except IndexError as e: + pass + # Try getting job with scontrol instead in case sacct is misconfigured + try: + sctrl_res = sp.check_output(shlex.split("scontrol -o show job {}".format(jobid))) + m = re.search("JobState=(\w+)", sctrl_res.decode()) + res = {jobid: m.group(1)} + break + except sp.CalledProcessError as e: + logger.error("scontrol process error") + logger.error(e) + if i >= STATUS_ATTEMPTS - 1: + print("failed") + exit(0) + else: + time.sleep(1) + +status = res[jobid] + +if status == "BOOT_FAIL": + print("failed") +elif status == 
"OUT_OF_MEMORY": + print("failed") +elif status.startswith("CANCELLED"): + print("failed") +elif status == "COMPLETED": + print("success") +elif status == "DEADLINE": + print("failed") +elif status == "FAILED": + print("failed") +elif status == "NODE_FAIL": + print("failed") +elif status == "PREEMPTED": + print("failed") +elif status == "TIMEOUT": + print("failed") +# Unclear whether SUSPENDED should be treated as running or failed +elif status == "SUSPENDED": + print("running") +else: + print("running") diff --git a/src/slurm/submit_slurm_job.py b/src/slurm/submit_slurm_job.py new file mode 100644 index 0000000000000000000000000000000000000000..87ad7feadf31d298a618cd95480af59784d07dea --- /dev/null +++ b/src/slurm/submit_slurm_job.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +""" +Submit a job to slurm using slurmpy +""" + +import datetime +from slurmpy import slurmpy + + +def submit_slurm_job(job_cmd, job_dict): + """ + Submit job command to slurm using slurmpy + + Arguments: + job_cmd (str) -- Job's complete command + job_dict (dict) -- Dictionary containing necessary job info + (resources, basename, etc) to pass to slurmpy. + Necessary keys: basename, log_dir, resources, run_locally + Optional keys: use_bash_strict, env_var_dict + + Returns: + slurm job ID + """ + + # basename and stuff needed for slurm job name. + # This will also be used for job script and log file names. 
+ job_basename = job_dict['basename'] + log_dir = job_dict['log_dir'] + date_time = datetime.datetime.now().isoformat().replace(':', '.') + job_name = f"{job_basename}{date_time}" + bash_strict = True if 'use_bash_strict' not in job_dict else job_dict['use_bash_strict'] + + print(f'Slurm job name : "{job_name}"') + print(f'Slurm job script : "{log_dir}/{job_name}.sh"') + + # set up the slurm job to submit + slurm_job = slurmpy.Slurm(name=job_basename, + slurm_kwargs=job_dict['resources'], + scripts_dir=log_dir, + log_dir=log_dir, + date_in_name=False, + bash_strict=bash_strict + ) + + params_dict = { + "command": job_cmd, + "name_addition": date_time + } + + # check if any environment variables need to be set in the job script + if "env_var_dict" in job_dict.keys(): + params_dict["cmd_kwargs"] = job_dict["env_var_dict"] + + # check if requested to run as a local job instead of slurm job. Useful for testing. + if job_dict['run_locally']: + params_dict["_cmd"] = "bash" + + # submit the slurm job + job_id = slurm_job.run(**params_dict) + + return job_id diff --git a/src/utility_cgds b/src/utility_cgds deleted file mode 160000 index 5145e05d34d694a676524d02d78ee21eae8ce64e..0000000000000000000000000000000000000000 --- a/src/utility_cgds +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5145e05d34d694a676524d02d78ee21eae8ce64e diff --git a/workflow/Snakefile b/workflow/Snakefile index bbf354aecd150af323e5d9d6438028fe0c434c86..14ef777c3911ca7d11fe38412aaa21c845ccb18b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -3,8 +3,6 @@ QuaC Pipeline to perform QC on bams and vcfs. Works at project level. 
""" -import pandas as pd - WORKFLOW_PATH = Path(workflow.basedir).parent diff --git a/workflow/rules/aggregate_results.smk b/workflow/rules/aggregate_results.smk index 5dac20e73a266af9e2919c25e5dac9a26584c952..8d9c8b96745f9ed21f4d9735ed2d8a25dc6505f0 100644 --- a/workflow/rules/aggregate_results.smk +++ b/workflow/rules/aggregate_results.smk @@ -16,7 +16,7 @@ rule create_multiqc_config: shell: r""" python {input.script} \ - --template_f {input.template} \ + --template_f {input.template} \ --qc_config {input.quac_watch_config} \ --outfile {output} """ @@ -25,7 +25,7 @@ rule create_multiqc_config: ########################## Single-sample-level QC aggregation ########################## rule multiqc_by_sample_initial_pass: input: - get_small_var_pipeline_targets, + get_small_var_pipeline_targets if INCLUDE_PRIOR_QC_DATA else [], OUT_DIR / "{sample}" / "qc" / "samtools-stats" / "{sample}.txt", OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "qualimapReport.html", OUT_DIR / "{sample}" / "qc" / "picard-stats" / "{sample}.alignment_summary_metrics", @@ -33,23 +33,23 @@ rule multiqc_by_sample_initial_pass: OUT_DIR / "{sample}" / "qc" / "verifyBamID" / "{sample}.Ancestry", OUT_DIR / "{sample}" / "qc" / "bcftools-stats" / "{sample}.bcftools.stats", multiqc_config=MULTIQC_CONFIG_FILE, - rename_config=PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv", + rename_config=PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv" if ALLOW_SAMPLE_RENAMING else [], output: protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc.html"), protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_general_stats.txt"), - protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastqc_trimmed.txt"), - protected(OUT_DIR / "{sample}" / "qc" / 
"multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastq_screen.txt"), + protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastqc_trimmed.txt") if INCLUDE_PRIOR_QC_DATA else [], + protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastq_screen.txt") if INCLUDE_PRIOR_QC_DATA else [], protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_picard_AlignmentSummaryMetrics.txt"), protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_picard_QualityYieldMetrics.txt"), protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_picard_wgsmetrics.txt"), - protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_picard_dups.txt"), + protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_picard_dups.txt") if INCLUDE_PRIOR_QC_DATA else [], # WARNING: don't put this rule in a group, bad things will happen. see issue #23 in gitlab (small var caller pipeline repo) message: "Aggregates QC results using multiqc. First pass. Output will be used for the QuaC-Watch. Sample: {wildcards.sample}" params: # multiqc uses fastq's filenames to identify sample names. 
Rename them to in-house names, - # using custom rename config file - extra=lambda wildcards, input: f"--config {input.multiqc_config} --sample-names {input.rename_config}", + # using custom rename config file, if needed + extra=lambda wildcards, input: f"--config {input.multiqc_config} --sample-names {input.rename_config}" if ALLOW_SAMPLE_RENAMING else f"--config {input.multiqc_config}", conda: ### see issue #47 on why local conda env is used to sidestep snakemake-wrapper's ### str(WORKFLOW_PATH / "configs/env/multiqc.yaml") @@ -61,13 +61,13 @@ rule quac_watch: input: qc_config=config["quac_watch_config"], multiqc_stats=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_general_stats.txt", - fastqc_trimmed=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastqc_trimmed.txt", - fastq_screen=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastq_screen.txt", + fastqc_trimmed=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastqc_trimmed.txt" if INCLUDE_PRIOR_QC_DATA else [], + fastq_screen=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastq_screen.txt" if INCLUDE_PRIOR_QC_DATA else [], qualimap=OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "genome_results.txt", picard_asm=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_picard_AlignmentSummaryMetrics.txt", picard_qym=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_picard_QualityYieldMetrics.txt", picard_wgs=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_picard_wgsmetrics.txt", - picard_dups=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_picard_dups.txt", + picard_dups=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / 
"{sample}_multiqc_data" / "multiqc_picard_dups.txt" if INCLUDE_PRIOR_QC_DATA else [], bcftools_index=OUT_DIR / "{sample}" / "qc" / "bcftools-index" / "{sample}.bcftools.index.tsv", output: protected( @@ -75,18 +75,21 @@ rule quac_watch: OUT_DIR / "{{sample}}" / "qc" / "quac_watch" / "quac_watch_{suffix}.yaml", suffix=[ "overall_summary", - "fastqc", - "fastq_screen", "qualimap_overall", "qualimap_chromosome_stats", "picard", - "picard_dups", "verifybamid", "bcftools_stats", "variant_per_contig", ], ) ), + protected( + expand( + OUT_DIR / "{{sample}}" / "qc" / "quac_watch" / "quac_watch_{suffix}.yaml", + suffix=[ "fastqc", "fastq_screen", "picard_dups"], + ) + ) if INCLUDE_PRIOR_QC_DATA else [], # WARNING: don't put this rule in a group, bad things will happen. see issue #23 in gitlab message: "Runs QuaC-Watch on various QC tool output, based on custom defined QC thresholds. " @@ -94,6 +97,7 @@ rule quac_watch: params: sample="{sample}", outdir=lambda wildcards, output: str(Path(output[0]).parent), + extra=lambda wildcards, input: f'--fastqc "{input.fastqc_trimmed}" --fastq_screen "{input.fastq_screen}" --picard_dups "{input.picard_dups}"' if INCLUDE_PRIOR_QC_DATA else "", conda: str(WORKFLOW_PATH / "configs/env/quac_watch.yaml") shell: @@ -101,14 +105,12 @@ rule quac_watch: python src/quac_watch/quac_watch.py \ --config {input.qc_config} \ --multiqc_stats {input.multiqc_stats} \ - --fastqc {input.fastqc_trimmed} \ - --fastq_screen {input.fastq_screen} \ --qualimap {input.qualimap} \ --picard_asm {input.picard_asm} \ --picard_qym {input.picard_qym} \ --picard_wgs {input.picard_wgs} \ - --picard_dups {input.picard_dups} \ --bcftools_index {input.bcftools_index} \ + {params.extra} \ --sample {params.sample} \ --outdir {params.outdir} """ @@ -116,7 +118,7 @@ rule quac_watch: rule multiqc_by_sample_final_pass: input: - get_small_var_pipeline_targets, + get_small_var_pipeline_targets if INCLUDE_PRIOR_QC_DATA else [], OUT_DIR / "{sample}" / "qc" / "samtools-stats" / 
"{sample}.txt", OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "qualimapReport.html", OUT_DIR / "{sample}" / "qc" / "picard-stats" / "{sample}.alignment_summary_metrics", @@ -125,7 +127,7 @@ rule multiqc_by_sample_final_pass: OUT_DIR / "{sample}" / "qc" / "bcftools-stats" / "{sample}.bcftools.stats", OUT_DIR / "{sample}" / "qc" / "quac_watch" / "quac_watch_overall_summary.yaml", multiqc_config=MULTIQC_CONFIG_FILE, - rename_config=PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv", + rename_config=PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv" if ALLOW_SAMPLE_RENAMING else [], output: protected(OUT_DIR / "{sample}" / "qc" / "multiqc_final_pass" / "{sample}_multiqc.html"), protected(OUT_DIR / "{sample}" / "qc" / "multiqc_final_pass" / "{sample}_multiqc_data" / "multiqc_general_stats.txt"), @@ -134,8 +136,8 @@ rule multiqc_by_sample_final_pass: "Aggregates QC results using multiqc. Final pass, where QuaC-Watch results are also aggregated. Sample: {wildcards.sample}" params: # multiqc uses fastq's filenames to identify sample names. 
Rename them to in-house names, - # using custom rename config file - extra=lambda wildcards, input: f"--config {input.multiqc_config} --sample-names {input.rename_config}", + # using custom rename config file, if needed + extra=lambda wildcards, input: f"--config {input.multiqc_config} --sample-names {input.rename_config}" if ALLOW_SAMPLE_RENAMING else f"--config {input.multiqc_config}", conda: ### see issue #47 on why local conda env is used to sidestep snakemake-wrapper's ### str(WORKFLOW_PATH / "configs/env/multiqc.yaml") @@ -148,7 +150,6 @@ rule multiqc_by_sample_final_pass: localrules: aggregate_sample_rename_configs, - rule aggregate_sample_rename_configs: input: expand( @@ -171,6 +172,13 @@ rule multiqc_aggregation_all_samples: PROJECT_PATH / "{sample}" / "qc" / "fastqc-trimmed" / "{sample}-{unit}-{read}_fastqc.zip", PROJECT_PATH / "{sample}" / "qc" / "fastq_screen-trimmed" / "{sample}-{unit}-{read}_screen.txt", PROJECT_PATH / "{sample}" / "qc" / "dedup" / "{sample}-{unit}.metrics.txt", + ], + sample=SAMPLES, + unit=[1], + read=["R1", "R2"], + ) if INCLUDE_PRIOR_QC_DATA else [], + expand( + [ OUT_DIR / "project_level_qc" / "somalier" / "relatedness" / "somalier.html", OUT_DIR / "project_level_qc" / "somalier" / "ancestry" / "somalier.somalier-ancestry.html", OUT_DIR / "{sample}" / "qc" / "samtools-stats" / "{sample}.txt", @@ -186,18 +194,21 @@ rule multiqc_aggregation_all_samples: read=["R1", "R2"], ), multiqc_config=MULTIQC_CONFIG_FILE, - rename_config=OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "aggregated_rename_configs.tsv", + rename_config=OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "aggregated_rename_configs.tsv" if ALLOW_SAMPLE_RENAMING else [], output: protected(OUT_DIR / "project_level_qc" / "multiqc" / "multiqc_report.html"), message: "Running multiqc for all samples" params: # multiqc uses fastq's filenames to identify sample names. 
Rename them to in-house names, - # using custom rename config file + # using custom rename config file, if needed extra=( lambda wildcards, input: f'--config {input.multiqc_config} \ --sample-names {input.rename_config} \ - --cl_config "max_table_rows: 2000"' + --cl_config "max_table_rows: 2000"' \ + if ALLOW_SAMPLE_RENAMING else \ + f'--config {input.multiqc_config} \ + --cl_config "max_table_rows: 2000"' ), conda: ### see issue #47 on why local conda env is used to sidestep snakemake-wrapper's ### diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 09186a1c8d1adbf684098f502d614e044b038e4c..bf2205610751b9c6359046b3e4799ff0d5c25cb4 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,4 +1,5 @@ import re +from pathlib import PurePath from snakemake.logging import logger @@ -20,7 +21,9 @@ def get_samples(ped_fpath): def is_testing_mode(): "checks if testing dataset is used as input for the pipeline" - if ".test/" in str(PROJECT_PATH): + query = ".test" + if query in PurePath(PROJECT_PATH).parts: + logger.info(f"// WARNING: '{query}' present in the path supplied via --projects_path. 
So testing mode is used.") return True return None @@ -87,7 +90,7 @@ def aggregate_rename_configs(rename_config_files, outfile): if not line_no: if line != "Original labels\tRenamed labels": - print(f"Unexpected header string in file '{fpath}'") + logger.error(f"Unexpected header string in file '{fpath}'") raise SystemExit(1) else: aggregated_data.append(line) @@ -104,6 +107,8 @@ PROJECT_NAME = config["project_name"] PROJECT_PATH = Path(config["projects_path"]) / PROJECT_NAME / "analysis" PEDIGREE_FPATH = config["ped"] EXOME_MODE = config["exome"] +ALLOW_SAMPLE_RENAMING = config["allow_sample_renaming"] +INCLUDE_PRIOR_QC_DATA = config["include_prior_qc_data"] #### configs from configfile #### RULE_LOGS_PATH = Path(config["log_dir"]) / "rule_logs" @@ -115,3 +120,4 @@ MULTIQC_CONFIG_FILE = OUT_DIR / "project_level_qc" / "multiqc" / "configs" / f"t logger.info(f"// Processing project: {PROJECT_NAME}") logger.info(f'// Project path: "{PROJECT_PATH}"') logger.info(f"// Exome mode: {EXOME_MODE}") +logger.info(f"// Include prior QC data: {INCLUDE_PRIOR_QC_DATA}") diff --git a/workflow/rules/coverage_analysis.smk b/workflow/rules/coverage_analysis.smk index 1b00f77b2e60d442b948baaccc3d18eaa77c716f..212bf859327ffe5e9b3aba5c255e0feb11ce79be 100644 --- a/workflow/rules/coverage_analysis.smk +++ b/workflow/rules/coverage_analysis.smk @@ -4,6 +4,9 @@ rule samtools_stats: PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam", output: protected(OUT_DIR / "{sample}" / "qc" / "samtools-stats" / "{sample}.txt"), + conda: + ### see issue #49 on why local conda env is used to sidestep snakemake-wrapper's ### + str(WORKFLOW_PATH / "configs/env/samtools.yaml") message: "stats bam using samtools. 
Sample: {wildcards.sample}" wrapper: @@ -56,6 +59,9 @@ rule picard_collect_multiple_metrics: ".alignment_summary_metrics", ".quality_yield_metrics", ), + conda: + ### see issue #49 on why local conda env is used to sidestep snakemake-wrapper's ### + str(WORKFLOW_PATH / "configs/env/picard_smk.yaml") message: "stats bam using Picard's collectmultiplemetrics. Sample: {wildcards.sample}" params: @@ -174,8 +180,8 @@ rule covviz: "Running covviz" log: OUT_DIR / "project_level_qc" / "covviz" / "stdout.log", - conda: - str(WORKFLOW_PATH / "configs/env/covviz.yaml") + singularity: + "docker://brwnj/covviz:v1.2.2" shell: r""" covviz \