From ae99f76b0b9ec659948a10af2b55853a0a6e481b Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Mon, 16 Sep 2024 11:38:09 +0200
Subject: [PATCH 001/178] reformat the script

---
 job.sh | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/job.sh b/job.sh
index 1a5b57e..a4280ff 100644
--- a/job.sh
+++ b/job.sh
@@ -3,7 +3,7 @@
 ### prepare_calling_jobs
 #SBATCH -J smk_main
 ### Max run time "hours:minutes:seconds"
-#SBATCH --time=120:00:00
+#SBATCH --time=96:00:00
 #SBATCH --ntasks=1 #nb of processes
 #SBATCH --cpus-per-task=1 # nb of cores for each process(1 process)
 #SBATCH --mem=10G # max of memory (-m)
@@ -14,7 +14,7 @@
 #SBATCH -o slurm_logs/snakemake.%N.%j.out
 #SBATCH -e slurm_logs/snakemake.%N.%j.err
 #SBATCH --mail-type=END,FAIL
-#SBATCH --mail-user=sukanya.denni@univ-rouen.fr
+#SBATCH --mail-user=lucien.piat@inrae.fr
 
 ################################################################################
 # Useful information to print
@@ -35,38 +35,29 @@ echo 'scontrol show job:'
 scontrol show job $SLURM_JOB_ID
 echo '########################################'
 
-## get SNG_BIND abs path using python
-function SNG_BIND_ABS_PATH {
-  SNG_BIND="$(python3 - <<END
-import os
-
-abs_path = os.getcwd()
-print(abs_path)
-
-END
-)"
-}
-SNG_BIND_ABS_PATH
+# relocate the modules and load python
+module purge
+module load python/3.9.7
+module load snakemake/6.5.1
 
 ### variables
+SNG_BIND="/mnt/cbib/pangenoak_trials/GenomAsm4pg/"
 CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml"
 MAX_CORES=10
 PROFILE=".config/snakemake_profile/slurm"
 
-### Module Loading:
-module purge
-module load snakemake/6.5.1
-
 echo 'Starting Snakemake workflow'
-
 ### Snakemake commands
-
 if [ "$1" = "dry" ]
 then
     # dry run
    snakemake --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG -n -r
-else
+elif [ -z "$1" ]
+then
    # run
    snakemake --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG
+else
+    echo "Error: Invalid argument. Use 'dry' or no argument." >&2
+    exit 1
 fi
\ No newline at end of file
-- 
GitLab
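With this change the launcher validates its single argument instead of treating anything other than `dry` as a normal run. A usage sketch based on the diff above (SLURM's `sbatch` is assumed; the invalid argument is only an example):

```bash
# Dry run: Snakemake only plans the jobs (-n -r), nothing is executed
sbatch job.sh dry

# Real run: no argument starts the workflow
sbatch job.sh

# Any other argument now fails fast instead of silently starting a run
sbatch job.sh fulll   # prints "Error: Invalid argument. Use 'dry' or no argument." to stderr and exits 1
```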
From e7652c2e96c491d14e882d3eebd34079752695a5 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Mon, 16 Sep 2024 12:00:01 +0200
Subject: [PATCH 002/178] add a module loading function

---
 job.sh | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/job.sh b/job.sh
index a4280ff..8438271 100644
--- a/job.sh
+++ b/job.sh
@@ -35,10 +35,18 @@ echo 'scontrol show job:'
 scontrol show job $SLURM_JOB_ID
 echo '########################################'
 
-# relocate the modules and load python
-module purge
-module load python/3.9.7
-module load snakemake/6.5.1
+# Function to load modules
+load_modules() {
+    module purge # Clear any previously loaded modules
+
+    # Loop through each module and load it
+    for module_name in "$@"; do
+        module load "$module_name"
+    done
+}
+
+# Here specify the modules to load and their path
+load_modules "python/3.9.7" "snakemake/6.5.1"
 
 ### variables
 SNG_BIND="/mnt/cbib/pangenoak_trials/GenomAsm4pg/"
-- 
GitLab

From 049e5e6e0698bccd977aebb4902ef348d35c3580 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 17 Sep 2024 14:32:05 +0200
Subject: [PATCH 003/178] fix time limit on cbib

---
 .config/snakemake_profile/slurm/cluster_config.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.config/snakemake_profile/slurm/cluster_config.yml b/.config/snakemake_profile/slurm/cluster_config.yml
index 9a19ab9..db27b01 100644
--- a/.config/snakemake_profile/slurm/cluster_config.yml
+++ b/.config/snakemake_profile/slurm/cluster_config.yml
@@ -1,7 +1,7 @@
 ### default ressources used by snakemake (applied to all rules)
 __default__:
   job-name: "{rule}"
-  time: "120:00:00" # max run time "hours:minutes:seconds"
+  time: "96:00:00" # max run time "hours:minutes:seconds"
   ntasks: 1 # nb of processes
   cpus-per-task: 4 # nb of cores for each process(1 process)
   mem: "60G"
@@ -10,7 +10,7 @@ __default__:
   output: "slurm_logs/{rule}.%N.%j.out"
   error: "slurm_logs/{rule}.%N.%j.err"
   mail-type: END,FAIL #email notification
-  mail-user: sukanya.denni@univ-rouen.fr
+  mail-user: lucien.piat@inrae.fr
 
 ### rule resources
 # convert with seqtk
@@ -58,4 +58,4 @@ merqury_trio:
   cpus-per-task: 20
 
 purge_merqury_trio:
-  cpus-per-task: 20
\ No newline at end of file
+  cpus-per-task: 20
-- 
GitLab
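The `load_modules` helper introduced in PATCH 002 keeps the module list in one place, so site-specific additions stay one-line changes. For example, on a cluster where Singularity is not loaded by default (the module name below is a placeholder, not part of this repository; check `module avail singularity` on your HPC):

```bash
# job.sh -- hypothetical extension of the module list; "singularity/3.8.5" is a placeholder name
load_modules "python/3.9.7" "snakemake/6.5.1" "singularity/3.8.5"
```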
From c537c6940b5b6210cafc0d3b07fda58df2a5b0b4 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 17 Sep 2024 15:06:07 +0200
Subject: [PATCH 004/178] start km size variable creation

---
 workflow/rules/03_asm_qc.smk        | 5 +++--
 workflow/rules/05_purged_asm_qc.smk | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/workflow/rules/03_asm_qc.smk b/workflow/rules/03_asm_qc.smk
index f32a480..8c37ac9 100644
--- a/workflow/rules/03_asm_qc.smk
+++ b/workflow/rules/03_asm_qc.smk
@@ -48,12 +48,13 @@ rule kat:
         res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/katplot/hap{n}/{id}_hap{n}.katplot.png"
     params:
         prefix="{id}_hap{n}",
-        path=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/katplot/hap{n}/{id}_hap{n}"
+        path=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/katplot/hap{n}/{id}_hap{n}",
+        km_size = config["km_size"]
     threads: 4
     container:
         "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/kat2.4.1"
     shell:
-        "kat comp -o {params.path} -t {threads} -m 21 --output_type png -v {input.jellyfish} {input.hap} && "
+        "kat comp -o {params.path} -t {threads} -m {params.km_size} --output_type png -v {input.jellyfish} {input.hap} && "
         "kat plot spectra-cn -x 200 -o {params.path}.katplot.png {params.path}-main.mx"
 
 # telomeres
diff --git a/workflow/rules/05_purged_asm_qc.smk b/workflow/rules/05_purged_asm_qc.smk
index d7a79ff..708f538 100644
--- a/workflow/rules/05_purged_asm_qc.smk
+++ b/workflow/rules/05_purged_asm_qc.smk
@@ -30,8 +30,9 @@ use rule kat as purge_kat with:
     output:
         res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/katplot/hap{n}/{id}_purged_hap{n}.katplot.png"
     params:
+        km_size = config["km_size"],
         prefix="{id}_hap{n}",
-        path= res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/katplot//hap{n}/{id}_purged_hap{n}"
+        path= res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/katplot/hap{n}/{id}_purged_hap{n}"
 
 rule purge_find_telomeres:
     input:
-- 
GitLab
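With `km_size` now read from `masterconfig.yaml`, the templated `kat comp` call above resolves, for a 21-mer run, to roughly the following (sample names are illustrative, not workflow output):

```bash
# What the kat rule executes once Snakemake fills in the placeholders (hypothetical sample "toy")
kat comp -o toy_hap1 -t 4 -m 21 --output_type png -v toy.jf toy_hap1.fasta && \
kat plot spectra-cn -x 200 -o toy_hap1.katplot.png toy_hap1-main.mx
```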
From 91d839edb8b483439da6193f1e82ab4fd09f263f Mon Sep 17 00:00:00 2001
From: PIAT LUCIEN <lucien.piat@inrae.fr>
Date: Tue, 17 Sep 2024 15:47:12 +0200
Subject: [PATCH 005/178] Update .gitlab-ci.yml file

---
 .gitlab-ci.yml | 38 ++++++------------------------------
 1 file changed, 8 insertions(+), 30 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a739b59..dbd9222 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,33 +1,11 @@
-# requiring the environment of NodeJS LTS
-image: node:lts
-
-# add 'node_modules' to cache for speeding up builds
-cache:
-  paths:
-    - node_modules/ # Node modules and dependencies
-
-before_script:
-  - npm init --yes
-  - npm install honkit --save-dev
+#Test file for CICD
+build-job:
+  stage: build
+  script:
+    - echo "Starting the CICD testing"
 
-test:
+test_job:
   stage: test
   script:
-    - npx honkit build . public # build to public path
-  only:
-    - branches # this job will affect every branch except 'main'
-  except:
-    - main
-
-# the 'pages' job will deploy and build your site to the 'public' path
-pages:
-  stage: deploy
-  script:
-    - npx honkit build . public # build to public path
-    - cp -r workflow/doc/fig public/workflow/doc/ # fix missing images asset not copied to public
-  artifacts:
-    paths:
-      - public
-    expire_in: 1 week
-  only:
-    - main # this job will affect only the 'main' branch
+    - echo "This job will check if job.sh is running smoothly"
+  
\ No newline at end of file
-- 
GitLab

From b18142bc9097c49e05bb5445c5352a1960a6f4e9 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 17 Sep 2024 16:22:57 +0200
Subject: [PATCH 006/178] add a scaffold for prejob testing

---
 .gitlab-ci.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index dbd9222..ef81092 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,6 +4,11 @@ build-job:
   script:
     - echo "Starting the CICD testing"
 
+test_prejob:
+  stage: test
+  script:
+    - echo "This job will check if job.sh is running smoothly
+
 test_job:
   stage: test
   script:
-- 
GitLab

From 10f63191b25586580eeb5aef9dd678374d4aa673 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 17 Sep 2024 16:23:41 +0200
Subject: [PATCH 007/178] improve slurm_log directory check

---
 job.sh    |  3 +++
 prejob.sh | 35 +++++++++++++++--------------------
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/job.sh b/job.sh
index 8438271..800a31c 100644
--- a/job.sh
+++ b/job.sh
@@ -48,6 +48,9 @@ load_modules() {
 # Here specify the modules to load and their path
 load_modules "python/3.9.7" "snakemake/6.5.1"
 
+# Create a directory for slurm logs if it is absent
+[ -d "slurm_logs" ] || mkdir -p "slurm_logs"
+
 ### variables
 SNG_BIND="/mnt/cbib/pangenoak_trials/GenomAsm4pg/"
 CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml"
diff --git a/prejob.sh b/prejob.sh
index ec05113..0c1da86 100644
--- a/prejob.sh
+++ b/prejob.sh
@@ -3,7 +3,7 @@
 ### prepare_calling_jobs
 #SBATCH -J smk_prejob
 ### Max run time "hours:minutes:seconds"
-#SBATCH --time=120:00:00
+#SBATCH --time=96:00:00
 #SBATCH --ntasks=1 #nb of processes
 #SBATCH --cpus-per-task=1 # nb of cores for each process(1 process)
 #SBATCH --mem=10G # max of memory (-m)
@@ -14,7 +14,7 @@
 #SBATCH -o slurm_logs/snakemake_prejob.%N.%j.out
 #SBATCH -e slurm_logs/snakemake_prejob.%N.%j.err
 #SBATCH --mail-type=END,FAIL
-#SBATCH --mail-user=sukanya.denni@univ-rouen.fr
+#SBATCH --mail-user=lucien.piat@inrae.fr
 
 ################################################################################
 # Useful information to print
@@ -36,33 +36,28 @@ scontrol show job $SLURM_JOB_ID
 echo '########################################'
 
-## get SNG_BIND abs path using python
-function SNG_BIND_ABS_PATH {
-  SNG_BIND="$(python3 - <<END
-import os
-
-abs_path = os.getcwd()
-print(abs_path)
-
-END
-)"
-}
-SNG_BIND_ABS_PATH
+# Function to load modules
+load_modules() {
+    module purge # Clear any previously loaded modules
+
+    # Loop through each module and load it
+    for module_name in "$@"; do
+        module load "$module_name"
+    done
+}
+load_modules "python/3.9.7" "snakemake/6.5.1"
 
 ### variable
+SNG_BIND="/mnt/cbib/pangenoak_trials/GenomAsm4pg/"
 CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml"
 MAX_CORES=4
 PROFILE=".config/snakemake_profile/slurm"
 SMK_PATH="workflow/pre-job_snakefiles"
 
-### Module Loading:
-module purge
-module load snakemake/6.5.1
-
 echo 'Starting Snakemake - data preparation'
 
-### create a log directory for slurm logs
-mkdir -p slurm_logs
+# Create a directory for slurm logs if it is absent
+[ -d "slurm_logs" ] || mkdir -p "slurm_logs"
 
 ### Snakemake commands
 # extract data
@@ -72,4 +67,4 @@ snakemake -s $SMK_PATH/Snakefile1.smk --profile $PROFILE -j $MAX_CORES --cluster # smrtlink on bam data snakemake -s $SMK_PATH/Snakefile2.smk --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG # convert fastq to fasta when necessary -snakemake -s $SMK_PATH/Snakefile3.smk --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG \ No newline at end of file +snakemake -s $SMK_PATH/Snakefile3.smk --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG -- GitLab From 0689b883a26db735f7dd5789e445203e2485f4f4 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 17 Sep 2024 16:26:55 +0200 Subject: [PATCH 008/178] remove slurm log tracking to avoid errors for denovo users --- .gitignore | 1 - slurm_logs/.gitignore | 6 ------ 2 files changed, 7 deletions(-) delete mode 100644 slurm_logs/.gitignore diff --git a/.gitignore b/.gitignore index 24af5b9..2f4d28f 100644 --- a/.gitignore +++ b/.gitignore @@ -51,7 +51,6 @@ !Snakefile !*.smk !*.svg -!slurm_logs/ # 3) add a pattern to track the file patterns of section2 even if they are in diff --git a/slurm_logs/.gitignore b/slurm_logs/.gitignore deleted file mode 100644 index 5d59d6d..0000000 --- a/slurm_logs/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -### This file is here to avoid a SLURM error when running a job because of the missing slurm_logs directory - -# Ignore everything in this directory -* -# Except this file -!.gitignore \ No newline at end of file -- GitLab From 5a9d1523c484e0dea93e7d430e4a1c39d7f5d618 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 17 Sep 2024 16:28:29 +0200 Subject: [PATCH 009/178] add TOC to readme to remove clutter --- README.md | 21 +++++++++++++++++++-- SUMMARY.md | 20 -------------------- 2 files changed, 19 insertions(+), 22 deletions(-) delete mode 100644 SUMMARY.md diff --git a/README.md b/README.md index 823d889..e8dd7a8 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,27 @@ doc: [Gitlab pages](https://asm4pg.pages.mia.inra.fr/GenomAsm4pg/)  ## Table of contents -[TOC] +# Summary -## Repo directory structure +* [Introduction](README.md) +* [Documentation summary](workflow/documentation.md) + * [Requirements](workflow/documentation.md#asm4pg-requirements) + * [Tutorials](workflow/documentation.md#tutorials) + * [Quick start](workflow/doc/Quick-start.md) + * [Hi-C mode](workflow/doc/Assembly-Mode/Hi-C-tutorial.md) + * [Trio mode](workflow/doc/Assembly-Mode/Trio-tutorial.md) + * [Outputs](workflow/documentation.md#outputs) + * [Workflow output](workflow/doc/Outputs.md) + * [Optional data preparation](workflow/documentation.md#optional-data-preparation) + * [if your data is in a tarball archive](workflow/doc/Tar-data-preparation.md) + * [Going further](workflow/doc/Going-further.md) + * [Troubleshooting](workflow/documentation.md#known-errors) + * [known errors](workflow/doc/Known-errors.md) + * [Software Dependencies](workflow/documentation.md#programs) + * [Programs listing](workflow/doc/Programs.md) +* [Gitlab pages using honkit](honkit.md) +## Repo directory structure ``` ├── README.md diff --git a/SUMMARY.md b/SUMMARY.md deleted file mode 100644 index 5972000..0000000 --- a/SUMMARY.md +++ /dev/null @@ -1,20 +0,0 @@ -# Summary - -* [Introduction](README.md) -* [Documentation summary](workflow/documentation.md) - * 
[Requirements](workflow/documentation.md#asm4pg-requirements) - * [Tutorials](workflow/documentation.md#tutorials) - * [Quick start](workflow/doc/Quick-start.md) - * [Hi-C mode](workflow/doc/Assembly-Mode/Hi-C-tutorial.md) - * [Trio mode](workflow/doc/Assembly-Mode/Trio-tutorial.md) - * [Outputs](workflow/documentation.md#outputs) - * [Workflow output](workflow/doc/Outputs.md) - * [Optional data preparation](workflow/documentation.md#optional-data-preparation) - * [if your data is in a tarball archive](workflow/doc/Tar-data-preparation.md) - * [Going further](workflow/doc/Going-further.md) - * [Troubleshooting](workflow/documentation.md#known-errors) - * [known errors](workflow/doc/Known-errors.md) - * [Software Dependencies](workflow/documentation.md#programs) - * [Programs listing](workflow/doc/Programs.md) -* [Gitlab pages using honkit](honkit.md) - -- GitLab From 1c4cf06bd6e484f6743a988c31291124d730a279 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 17 Sep 2024 16:39:05 +0200 Subject: [PATCH 010/178] add lost " --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ef81092..8acf0db 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,7 +7,7 @@ build-job: test_prejob: stage: test script: - - echo "This job will check if job.sh is running smoothly + - echo "This job will check if job.sh is running smoothly" test_job: stage: test -- GitLab From b9b78c29336d050293fb6a2b9066393b02036cfe Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 17 Sep 2024 17:53:28 +0200 Subject: [PATCH 011/178] added dependencies for testing --- .gitlab-ci.yml | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8acf0db..2c50fb4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,16 +1,29 @@ -#Test file for CICD -build-job: - stage: build - script: - - echo "Starting the CICD testing" +stages: + - setup + - run -test_prejob: - stage: test +# Job to set up the environment and install dependencies +setup_environment: + stage: setup + image: continuumio/miniconda3 script: - - echo "This job will check if job.sh is running smoothly" + - conda install -c conda-forge mamba # Install Mamba + - mamba create -n snakemake -c conda-forge -c bioconda snakemake # Create environment and install Snakemake + - echo "source activate snakemake" > ~/.bashrc # Ensure the environment is activated + - source activate snakemake + - snakemake --version + artifacts: + paths: + - .conda/ + - .snakemake/ test_job: stage: test + image: continuumio/miniconda3 script: - - echo "This job will check if job.sh is running smoothly" - \ No newline at end of file + - git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git + - touch test_data.fasta.gz + - cd GenomAsm4pg + - source activate snakemake + dependencies: + - setup_environment \ No newline at end of file -- GitLab From c07d9a8ab52f3f72a2d1c70bac545690ff84f247 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 17 Sep 2024 17:54:18 +0200 Subject: [PATCH 012/178] correct syntax --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2c50fb4..45c0476 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,6 @@ stages: - setup - - run + - test # Job to set up the environment and install dependencies setup_environment: -- GitLab From 09fd8b0d2b79743090dc06940ed9048b11dd9dac Mon Sep 17 
00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 17 Sep 2024 17:56:15 +0200 Subject: [PATCH 013/178] remove unused file --- honkit.md | 176 ------------------------------------------------------ 1 file changed, 176 deletions(-) delete mode 100644 honkit.md diff --git a/honkit.md b/honkit.md deleted file mode 100644 index 70fc45a..0000000 --- a/honkit.md +++ /dev/null @@ -1,176 +0,0 @@ -# HonKit - -HonKit is building beautiful books using GitHub/Git and Markdown. - - - -## Documentation and Demo - -HonKit documentation is built by HonKit! - -- <https://honkit.netlify.app/> - -## Quick Start - -### Installation - -- Requirement: [Node.js](https://nodejs.org) [LTS](https://nodejs.org/about/releases/) version - -The best way to install HonKit is via **NPM** or **Yarn**. - -``` -$ npm init --yes -$ npm install honkit --save-dev -``` - -âš ï¸ Warning: - -- If you have installed `honkit` globally, you must install each plugins globally as well -- If you have installed `honkit` locally, you must install each plugins locally as well - -We recommend installing `honkit` locally. - -### Create a book - -HonKit can set up a boilerplate book: - -``` -$ npx honkit init -``` - -If you wish to create the book into a new directory, you can do so by running `honkit init ./directory` - -Preview and serve your book using: - -``` -$ npx honkit serve -``` - -Or build the static website using: - -``` -$ npx honkit build -``` - -You can start to write your book! - -For more details, see [HonKit's documentation](https://honkit.netlify.app/). - -## Docker support - -Honkit provide docker image at [honkit/honkit](https://hub.docker.com/r/honkit/honkit). - -This docker image includes built-in dependencies for PDF/epub. - -``` -docker pull honkit/honkit -docker run -v `pwd`:`pwd` -w `pwd` --rm -it honkit/honkit honkit build -docker run -v `pwd`:`pwd` -w `pwd` --rm -it honkit/honkit honkit pdf -``` - -For more details, see [docker/](./docker/). - -## Usage examples - -HonKit can be used to create a book, public documentation, enterprise manual, thesis, research papers, etc. - -You can find a list of [real-world examples](https://honkit.netlify.app/examples.html) in the documentation. - -## Features - -* Write using [Markdown](https://honkit.netlify.app/syntax/markdown.html) or [AsciiDoc](https://honkit.netlify.app/syntax/asciidoc.html) -* Output as a website or [ebook (pdf, epub, mobi)](https://honkit.netlify.app/ebook.html) -* [Multi-Languages](https://honkit.netlify.app/languages.html) -* [Lexicon / Glossary](https://honkit.netlify.app/lexicon.html) -* [Cover](https://honkit.netlify.app/ebook.html) -* [Variables and Templating](https://honkit.netlify.app/templating/) -* [Content References](https://honkit.netlify.app/templating/conrefs.html) -* [Plugins](https://honkit.netlify.app/plugins/) -* [Beautiful default theme](./packages/@honkit/theme-default) - -## Fork of GitBook - -HonKit is a fork of [GitBook (Legacy)](https://github.com/GitbookIO/gitbook). -[GitBook (Legacy)](https://github.com/GitbookIO/gitbook) is [deprecated](https://github.com/GitbookIO/gitbook/commit/6c6ef7f4af32a2977e44dd23d3feb6ebf28970f4) and an inactive project. - -HonKit aims to smooth the migration from GitBook (Legacy) to HonKit. - -### Compatibility with GitBook - -- Almost all plugins work without changes! 
-- Support `gitbook-plugin-*` packages - - You should install these plugins via npm or yarn - - `npm install gitbook-plugin-<example> --save-dev` - -### Differences with GitBook - -- Node.js 14+ supports -- Improve `build`/`serve` performance - - `honkit build`: use file cache by default - - `honkit serve`: 28.2s → 0.9s in [examples/benchmark](examples/benchmark) - - Also, support `--reload` flag for force refresh -- Improve plugin loading logic - - Reduce cost of finding `honkit-plugin-*` and `gitbook-plugin-*` - - Support `honkit-plugin-*` and `@scope/honkit-plugin-*` (GitBook does not support a scoped module) -- Remove `install` command - - Instead of it, just use `npm install` or `yarn install` -- Remove `global-npm` dependency - - You can use HonKit with another npm package manager like `yarn` -- Update dependencies - - Upgrade to nunjucks@2, highlight.js etc... - - It will reduce bugs -- TypeScript - - Rewritten by TypeScript -- Monorepo codebase - - Easy to maintain -- [Docker support](./docker) - -### Migration from GitBook - -Replace `gitbook-cli` with `honkit`. - -``` -npm uninstall gitbook-cli -npm install honkit --save-dev -``` - -Replace `gitbook` command with `honkit` command. - -```diff - "scripts": { -- "build": "gitbook build", -+ "build": "honkit build", -- "serve": "gitbook serve" -+ "serve": "honkit serve" - }, -``` - -After that, HonKit just works! - -Examples of migration: - -- [Add a Github action to deploy · DjangoGirls/tutorial](https://github.com/DjangoGirls/tutorial/pull/1666) -- [Migrate from GitBook to Honkit · swaroopch/byte-of-python](https://github.com/swaroopch/byte-of-python/pull/88) -- [replace Gitbook into Honkit · yamat47/97-things-every-programmer-should-know](https://github.com/yamat47/97-things-every-programmer-should-know/pull/2) -- [Migrate misp-book from GitBook to honkit](https://github.com/MISP/misp-book/pull/227) - -## Benchmarks - -`honkit build` benchmark: - -- <https://honkit.github.io/honkit/dev/bench/> - -## Licensing - -HonKit is licensed under the Apache License, Version 2.0. See [LICENSE](LICENSE) for the full license text. - -HonKit is a fork of [GitBook (Legacy)](https://github.com/GitbookIO/gitbook). -GitBook is licensed under the Apache License, Version 2.0. - -Also, HonKit includes [bignerdranch/gitbook](https://github.com/bignerdranch/gitbook) works. 
- -## Sponsors - -<a href="https://www.netlify.com"> -<img src="https://www.netlify.com/img/global/badges/netlify-color-bg.svg" alt="Deploys by Netlify" /> -</a> -- GitLab From 9a3e66c56f8202137b40b54842d1f25931b37a16 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 18 Sep 2024 10:19:36 +0200 Subject: [PATCH 014/178] Remove directory creation to remove SLURM conflict --- job.sh | 3 --- prejob.sh | 3 --- 2 files changed, 6 deletions(-) diff --git a/job.sh b/job.sh index 800a31c..8438271 100644 --- a/job.sh +++ b/job.sh @@ -48,9 +48,6 @@ load_modules() { # Here specify the modules to load and their path load_modules "python/3.9.7" "snakemake/6.5.1" -# Create a directory for slurm logs if it is absent -[ -d "slurm_logs" ] || mkdir -p "slurm_logs" - ### variables SNG_BIND="/mnt/cbib/pangenoak_trials/GenomAsm4pg/" CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml" diff --git a/prejob.sh b/prejob.sh index 0c1da86..16eece8 100644 --- a/prejob.sh +++ b/prejob.sh @@ -56,9 +56,6 @@ SMK_PATH="workflow/pre-job_snakefiles" echo 'Starting Snakemake - data preparation' -# Create a directory for slurm logs if it is absent -[ -d "slurm_logs" ] || mkdir -p "slurm_logs" - ### Snakemake commands # extract data snakemake -s $SMK_PATH/Snakefile1.smk --profile $PROFILE -j $MAX_CORES --cluster-config $CLUSTER_CONFIG -- GitLab From dadbf394869e993df6ab0d7be4087183d5a64e12 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 18 Sep 2024 10:20:09 +0200 Subject: [PATCH 015/178] add gitignore for slurm logs --- slurm_logs/.gitignore | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 slurm_logs/.gitignore diff --git a/slurm_logs/.gitignore b/slurm_logs/.gitignore new file mode 100644 index 0000000..9ef8e2c --- /dev/null +++ b/slurm_logs/.gitignore @@ -0,0 +1,5 @@ +### Slurm needs an output for the log files +# Ignore everything in this directory +* +# Except this file +!.gitignore \ No newline at end of file -- GitLab From 4a0fc0c8f9dc5f821ac0fa04f69f6af882c739fe Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 18 Sep 2024 10:21:05 +0200 Subject: [PATCH 016/178] Update the readme file so its up to date with the small QOL improvements --- .config/masterconfig.yaml | 16 +++-- .gitlab-ci.yml | 28 ++------ .../snakejob.genometools_on_raw_data.13.sh | 12 ++++ .snakemake/tmp.46vxigdb/snakejob.hifiasm.4.sh | 12 ++++ .../tmp.46vxigdb/snakejob.jellyfish.12.sh | 12 ++++ .snakemake/tmp.46vxigdb/snakejob.meryl.25.sh | 12 ++++ .../tmp.46vxigdb/snakejob.start_time.39.sh | 12 ++++ workflow/doc/Quick-start.md | 68 ++++++++----------- 8 files changed, 108 insertions(+), 64 deletions(-) create mode 100755 .snakemake/tmp.46vxigdb/snakejob.genometools_on_raw_data.13.sh create mode 100755 .snakemake/tmp.46vxigdb/snakejob.hifiasm.4.sh create mode 100755 .snakemake/tmp.46vxigdb/snakejob.jellyfish.12.sh create mode 100755 .snakemake/tmp.46vxigdb/snakejob.meryl.25.sh create mode 100755 .snakemake/tmp.46vxigdb/snakejob.start_time.39.sh diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml index 6f0f25a..82690e8 100644 --- a/.config/masterconfig.yaml +++ b/.config/masterconfig.yaml @@ -1,9 +1,9 @@ # absolute path to your desired output path -root: . 
+root: /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output ####################### optional prejob - data preparation ####################### # path to tar data -data: /path +data: /mnt/cbib/pangenoak_trials/GenomAsm4pg/ # list of tar names get_all_tar_filename: False tarIDS: "tar_filename" @@ -11,14 +11,22 @@ tarIDS: "tar_filename" ####################### job - workflow ####################### # number of threads used by pigz pigz_threads: 4 +# k-mers size (reduce for small datasets) +km_size: 21 #TODO ### CONFIG +IDS: ["toy_dataset_bug"] - +toy_dataset_bug: + fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz" + run: toy_test_run + ploidy: 2 + busco_lineage: arthropoda_odb10 + mode: default ####################### workflow output directories ####################### # results directory -resdir: workflow_results +resdir: results ### PREJOB # extracted input data diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 45c0476..806be8b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,29 +1,13 @@ -stages: - - setup - - test - -# Job to set up the environment and install dependencies -setup_environment: - stage: setup - image: continuumio/miniconda3 +test_job: + stage: test + image: continuumio/miniconda3 script: - conda install -c conda-forge mamba # Install Mamba - mamba create -n snakemake -c conda-forge -c bioconda snakemake # Create environment and install Snakemake - echo "source activate snakemake" > ~/.bashrc # Ensure the environment is activated - source activate snakemake - snakemake --version - artifacts: - paths: - - .conda/ - - .snakemake/ - -test_job: - stage: test - image: continuumio/miniconda3 - script: - - git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git - - touch test_data.fasta.gz - cd GenomAsm4pg - - source activate snakemake - dependencies: - - setup_environment \ No newline at end of file + - touch test_data.fasta.gz + - source activate snakemake + - echo "test over" \ No newline at end of file diff --git a/.snakemake/tmp.46vxigdb/snakejob.genometools_on_raw_data.13.sh b/.snakemake/tmp.46vxigdb/snakejob.genometools_on_raw_data.13.sh new file mode 100755 index 0000000..216e265 --- /dev/null +++ b/.snakemake/tmp.46vxigdb/snakejob.genometools_on_raw_data.13.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# properties = {"type": "single", "rule": "genometools_on_raw_data", "local": false, "input": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz"], "output": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/01_raw_data_QC/03_genometools/toy_dataset_bug.RawStat.txt"], "wildcards": {"runid": "toy_dataset_bug/toy_test_run", "id": "toy_dataset_bug"}, "params": {}, "log": [], "threads": 4, "resources": {"tmpdir": "/tmp"}, "jobid": 13, "cluster": {"job-name": "genometools_on_raw_data", "time": "96:00:00", "ntasks": 1, "cpus-per-task": 4, "mem": "60G", "nodes": 1, "ntasks-per-node": 1, "output": "slurm_logs/genometools_on_raw_data.%N.%j.out", "error": "slurm_logs/genometools_on_raw_data.%N.%j.err", "mail-type": "END,FAIL", "mail-user": "lucien.piat@inrae.fr"}} + cd /isilon/cbib/pangenoak_trials/GenomAsm4pg && \ +/module/apps/snakemake/5.8.1/bin/python \ +-m snakemake /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/01_raw_data_QC/03_genometools/toy_dataset_bug.RawStat.txt --snakefile /isilon/cbib/pangenoak_trials/GenomAsm4pg/workflow/Snakefile \ +--force --cores all --keep-target-files --keep-remote --max-inventory-time 0 \ +--wait-for-files 
/isilon/cbib/pangenoak_trials/GenomAsm4pg/.snakemake/tmp.46vxigdb /mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz --latency-wait 60 \ + --attempt 1 --force-use-threads --scheduler greedy \ +--wrapper-prefix https://github.com/snakemake/snakemake-wrappers/raw/ \ + --allowed-rules genometools_on_raw_data --nocolor --notemp --no-hooks --nolock --scheduler-solver-path /module/apps/snakemake/5.8.1/bin \ +--mode 2 --use-singularity --singularity-args "-B /mnt/cbib/pangenoak_trials/GenomAsm4pg/" --default-resources "tmpdir=system_tmpdir" && exit 0 || exit 1 + diff --git a/.snakemake/tmp.46vxigdb/snakejob.hifiasm.4.sh b/.snakemake/tmp.46vxigdb/snakejob.hifiasm.4.sh new file mode 100755 index 0000000..11ce99c --- /dev/null +++ b/.snakemake/tmp.46vxigdb/snakejob.hifiasm.4.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# properties = {"type": "single", "rule": "hifiasm", "local": false, "input": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz"], "output": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/00_assembly/toy_dataset_bug.bp.hap1.p_ctg.gfa", "/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/00_assembly/toy_dataset_bug.bp.hap2.p_ctg.gfa"], "wildcards": {"runid": "toy_dataset_bug/toy_test_run", "id": "toy_dataset_bug"}, "params": {"prefix": "/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/00_assembly/toy_dataset_bug"}, "log": [], "threads": 20, "resources": {"tmpdir": "/tmp", "mem_mb": 250000}, "jobid": 4, "cluster": {"job-name": "hifiasm", "time": "96:00:00", "ntasks": 1, "cpus-per-task": 20, "mem": "250G", "nodes": 1, "ntasks-per-node": 1, "output": "slurm_logs/hifiasm.%N.%j.out", "error": "slurm_logs/hifiasm.%N.%j.err", "mail-type": "END,FAIL", "mail-user": "lucien.piat@inrae.fr"}} + cd /isilon/cbib/pangenoak_trials/GenomAsm4pg && \ +/module/apps/snakemake/5.8.1/bin/python \ +-m snakemake /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/00_assembly/toy_dataset_bug.bp.hap1.p_ctg.gfa --snakefile /isilon/cbib/pangenoak_trials/GenomAsm4pg/workflow/Snakefile \ +--force --cores all --keep-target-files --keep-remote --max-inventory-time 0 \ +--wait-for-files /isilon/cbib/pangenoak_trials/GenomAsm4pg/.snakemake/tmp.46vxigdb /mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz --latency-wait 60 \ + --attempt 1 --force-use-threads --scheduler greedy \ +--wrapper-prefix https://github.com/snakemake/snakemake-wrappers/raw/ \ + --allowed-rules hifiasm --nocolor --notemp --no-hooks --nolock --scheduler-solver-path /module/apps/snakemake/5.8.1/bin \ +--mode 2 --use-singularity --singularity-args "-B /mnt/cbib/pangenoak_trials/GenomAsm4pg/" --default-resources "tmpdir=system_tmpdir" && exit 0 || exit 1 + diff --git a/.snakemake/tmp.46vxigdb/snakejob.jellyfish.12.sh b/.snakemake/tmp.46vxigdb/snakejob.jellyfish.12.sh new file mode 100755 index 0000000..e149b2d --- /dev/null +++ b/.snakemake/tmp.46vxigdb/snakejob.jellyfish.12.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# properties = {"type": "single", "rule": "jellyfish", "local": false, "input": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz"], "output": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/01_raw_data_QC/04_kmer/toy_dataset_bug.jf", 
"/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/01_raw_data_QC/04_kmer/toy_dataset_bug.histo"], "wildcards": {"runid": "toy_dataset_bug/toy_test_run", "id": "toy_dataset_bug"}, "params": {}, "log": [], "threads": 4, "resources": {"tmpdir": "/tmp", "mem_mb": 40000}, "jobid": 12, "cluster": {"job-name": "jellyfish", "time": "96:00:00", "ntasks": 1, "cpus-per-task": 4, "mem": "60G", "nodes": 1, "ntasks-per-node": 1, "output": "slurm_logs/jellyfish.%N.%j.out", "error": "slurm_logs/jellyfish.%N.%j.err", "mail-type": "END,FAIL", "mail-user": "lucien.piat@inrae.fr"}} + cd /isilon/cbib/pangenoak_trials/GenomAsm4pg && \ +/module/apps/snakemake/5.8.1/bin/python \ +-m snakemake /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/01_raw_data_QC/04_kmer/toy_dataset_bug.histo --snakefile /isilon/cbib/pangenoak_trials/GenomAsm4pg/workflow/Snakefile \ +--force --cores all --keep-target-files --keep-remote --max-inventory-time 0 \ +--wait-for-files /isilon/cbib/pangenoak_trials/GenomAsm4pg/.snakemake/tmp.46vxigdb /mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz --latency-wait 60 \ + --attempt 1 --force-use-threads --scheduler greedy \ +--wrapper-prefix https://github.com/snakemake/snakemake-wrappers/raw/ \ + --allowed-rules jellyfish --nocolor --notemp --no-hooks --nolock --scheduler-solver-path /module/apps/snakemake/5.8.1/bin \ +--mode 2 --use-singularity --singularity-args "-B /mnt/cbib/pangenoak_trials/GenomAsm4pg/" --default-resources "tmpdir=system_tmpdir" && exit 0 || exit 1 + diff --git a/.snakemake/tmp.46vxigdb/snakejob.meryl.25.sh b/.snakemake/tmp.46vxigdb/snakejob.meryl.25.sh new file mode 100755 index 0000000..718e4bd --- /dev/null +++ b/.snakemake/tmp.46vxigdb/snakejob.meryl.25.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# properties = {"type": "single", "rule": "meryl", "local": false, "input": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz"], "output": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/toy_dataset_bug_reads-db_k21.meryl"], "wildcards": {"runid": "toy_dataset_bug/toy_test_run", "id": "toy_dataset_bug"}, "params": {}, "log": [], "threads": 20, "resources": {"tmpdir": "/tmp", "mem_mb": 60000}, "jobid": 25, "cluster": {"job-name": "meryl", "time": "96:00:00", "ntasks": 1, "cpus-per-task": 10, "mem": "60G", "nodes": 1, "ntasks-per-node": 1, "output": "slurm_logs/meryl.%N.%j.out", "error": "slurm_logs/meryl.%N.%j.err", "mail-type": "END,FAIL", "mail-user": "lucien.piat@inrae.fr"}} + cd /isilon/cbib/pangenoak_trials/GenomAsm4pg && \ +/module/apps/snakemake/5.8.1/bin/python \ +-m snakemake /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/toy_dataset_bug_reads-db_k21.meryl --snakefile /isilon/cbib/pangenoak_trials/GenomAsm4pg/workflow/Snakefile \ +--force --cores all --keep-target-files --keep-remote --max-inventory-time 0 \ +--wait-for-files /isilon/cbib/pangenoak_trials/GenomAsm4pg/.snakemake/tmp.46vxigdb /mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz --latency-wait 60 \ + --attempt 1 --force-use-threads --scheduler greedy \ +--wrapper-prefix https://github.com/snakemake/snakemake-wrappers/raw/ \ + --allowed-rules meryl --nocolor --notemp --no-hooks --nolock --scheduler-solver-path /module/apps/snakemake/5.8.1/bin \ +--mode 2 --use-singularity --singularity-args "-B 
/mnt/cbib/pangenoak_trials/GenomAsm4pg/" --default-resources "tmpdir=system_tmpdir" && exit 0 || exit 1 + diff --git a/.snakemake/tmp.46vxigdb/snakejob.start_time.39.sh b/.snakemake/tmp.46vxigdb/snakejob.start_time.39.sh new file mode 100755 index 0000000..2668563 --- /dev/null +++ b/.snakemake/tmp.46vxigdb/snakejob.start_time.39.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# properties = {"type": "single", "rule": "start_time", "local": false, "input": [], "output": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/runtime.txt"], "wildcards": {"runid": "toy_dataset_bug/toy_test_run"}, "params": {}, "log": [], "threads": 1, "resources": {"tmpdir": "/tmp"}, "jobid": 39, "cluster": {"job-name": "start_time", "time": "96:00:00", "ntasks": 1, "cpus-per-task": 4, "mem": "60G", "nodes": 1, "ntasks-per-node": 1, "output": "slurm_logs/start_time.%N.%j.out", "error": "slurm_logs/start_time.%N.%j.err", "mail-type": "END,FAIL", "mail-user": "lucien.piat@inrae.fr"}} + cd /isilon/cbib/pangenoak_trials/GenomAsm4pg && \ +/module/apps/snakemake/5.8.1/bin/python \ +-m snakemake /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/runtime.txt --snakefile /isilon/cbib/pangenoak_trials/GenomAsm4pg/workflow/Snakefile \ +--force --cores all --keep-target-files --keep-remote --max-inventory-time 0 \ +--wait-for-files /isilon/cbib/pangenoak_trials/GenomAsm4pg/.snakemake/tmp.46vxigdb --latency-wait 60 \ + --attempt 1 --force-use-threads --scheduler greedy \ +--wrapper-prefix https://github.com/snakemake/snakemake-wrappers/raw/ \ + --allowed-rules start_time --nocolor --notemp --no-hooks --nolock --scheduler-solver-path /module/apps/snakemake/5.8.1/bin \ +--mode 2 --use-singularity --singularity-args "-B /mnt/cbib/pangenoak_trials/GenomAsm4pg/" --default-resources "tmpdir=system_tmpdir" && exit 0 || exit 1 + diff --git a/workflow/doc/Quick-start.md b/workflow/doc/Quick-start.md index 4db0da8..8e55591 100644 --- a/workflow/doc/Quick-start.md +++ b/workflow/doc/Quick-start.md @@ -1,38 +1,35 @@ # Quick start - This tutorial shows how to use the workflow with default assembly mode which takes PacBio Hifi data as input. [TOC] ## Clone repository ```bash -cd . git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git ``` - -## 1. Cluster profile setup +Clone the repository in your desired folder. +## 1. Set up the scripts +### In job.sh ```bash -cd GenomAsm4pg/.config/snakemake_profile +cd GenomAsm4pg/ +vim job.sh ``` -The current profile is made for SLURM. If you use it, change line 13 to your email address in the `cluster_config`.yml file. - -To run this workflow on another HPC, create another profile (https://github.com/Snakemake-Profiles) and add it in the `.config/snakemake_profile` directory. Change the `CLUSTER_CONFIG` and `PROFILE` variables in `job.sh` and `prejob.sh` scripts. - - -## 2. Config file -**TO-DO : add a toy fasta.** +Modify: +- Line 17: Set your email address. +- Line 53: Set the path to your dataset folder. +### In masterconfig.yaml ```bash -cd .. +vim .config/masterconfig.yaml ``` - -Modify `masterconfig.yaml`. Root refers to the path for the output data. +Modify +- Line 2: Set the path to your output folder. ```yaml # absolute path to your desired output path -root: ./GenomAsm4pg/tutorial_output +root: ./GenomAsm4pg/<your_output_folder> ``` - -The reads file is `toy_dataset.fasta`, its name is used as key in config. - +Modify +- Line 18: Add all your raw datasets in IDS. +- Line 20: Provide the parameters for all datasets. 
```yaml ####################### job - workflow ####################### ### CONFIG @@ -42,36 +39,31 @@ toy_dataset: fasta: "./GenomAsm4pg/tutorial_data/toy_dataset.fasta" run: tutorial ploidy: 2 - busco_lineage: eudicots_odb10 + busco_lineage: eudicots_odb10 mode: default ``` - -## 3. Create slurm_logs directory +## 2. Addapt the scripts to your HPC ```bash -cd .. -mkdir slurm_logs +vim .config/snakemake_profile/slurm/cluster_config.yml ``` -SLURM logs for each rule will be in this directory, there are .out and .err files for the worklow (*snakemake.cortex**) and for each rules (*rulename.cortex**). - -## 4. Mail setup -Modify line 17 to your email address in `job.sh`. +The current profile is configured for SLURM. If you use SLURM, change line 13 to your email address. -## 5. Dry run -To check the config, first do a dry run of the workflow. +To run this workflow on another HPC, create a new profile (https://github.com/Snakemake-Profiles) and add it to the .config/snakemake_profile directory. Update the CLUSTER_CONFIG and PROFILE variables in the job.sh and prejob.sh scripts. +If your cluster doesn’t have Singularity enabled by default, add it to the list of modules to load in job.sh. +## 3. Dry run +To check the configuration, first perform a dry run of the workflow: ```bash sbatch job.sh dry ``` -## 6. Run -If the dry run is successful, check that the `SNG_BIND` variable in `job.sh` is the same as `root` variable in `masterconfig.yaml`. - -If Singularity is not in the HPC environment, add `module load singularity` under `module load snakemake/6.5.1`. - -You can run the workflow. - +You can consult the logs in the slurm_logs/ directory. +## 4. Run +If the dry run is successful, ensure that the SNG_BIND variable in job.sh matches the root variable in masterconfig.yaml. +Then, run the script: ```bash sbatch job.sh ``` - ## Other assembly modes If you want to use additional Hi-C data or parental data, follow the [Hi-C assembly mode tutorial](Assembly-Mode/Hi-C-tutorial.md) or the [Trio assembly mode tutorial](Assembly-Mode/Trio-tutorial.md). To go further with the workflow use go [here](Going-further.md). + +**TO-DO : add a toy fasta.** \ No newline at end of file -- GitLab From 1b281e6fc9c7aa65b57d8076cda9f1775b8554f5 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-1.cluster> Date: Wed, 18 Sep 2024 10:23:23 +0200 Subject: [PATCH 017/178] Revert "Update the readme file so its up to date with the small QOL improvements" This reverts commit 4a0fc0c8f9dc5f821ac0fa04f69f6af882c739fe. 
--- .config/masterconfig.yaml | 16 ++--- .gitlab-ci.yml | 28 ++++++-- .../snakejob.genometools_on_raw_data.13.sh | 12 ---- .snakemake/tmp.46vxigdb/snakejob.hifiasm.4.sh | 12 ---- .../tmp.46vxigdb/snakejob.jellyfish.12.sh | 12 ---- .snakemake/tmp.46vxigdb/snakejob.meryl.25.sh | 12 ---- .../tmp.46vxigdb/snakejob.start_time.39.sh | 12 ---- workflow/doc/Quick-start.md | 68 +++++++++++-------- 8 files changed, 64 insertions(+), 108 deletions(-) delete mode 100755 .snakemake/tmp.46vxigdb/snakejob.genometools_on_raw_data.13.sh delete mode 100755 .snakemake/tmp.46vxigdb/snakejob.hifiasm.4.sh delete mode 100755 .snakemake/tmp.46vxigdb/snakejob.jellyfish.12.sh delete mode 100755 .snakemake/tmp.46vxigdb/snakejob.meryl.25.sh delete mode 100755 .snakemake/tmp.46vxigdb/snakejob.start_time.39.sh diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml index 82690e8..6f0f25a 100644 --- a/.config/masterconfig.yaml +++ b/.config/masterconfig.yaml @@ -1,9 +1,9 @@ # absolute path to your desired output path -root: /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output +root: . ####################### optional prejob - data preparation ####################### # path to tar data -data: /mnt/cbib/pangenoak_trials/GenomAsm4pg/ +data: /path # list of tar names get_all_tar_filename: False tarIDS: "tar_filename" @@ -11,22 +11,14 @@ tarIDS: "tar_filename" ####################### job - workflow ####################### # number of threads used by pigz pigz_threads: 4 -# k-mers size (reduce for small datasets) -km_size: 21 #TODO ### CONFIG -IDS: ["toy_dataset_bug"] -toy_dataset_bug: - fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz" - run: toy_test_run - ploidy: 2 - busco_lineage: arthropoda_odb10 - mode: default + ####################### workflow output directories ####################### # results directory -resdir: results +resdir: workflow_results ### PREJOB # extracted input data diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 806be8b..45c0476 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,13 +1,29 @@ -test_job: - stage: test - image: continuumio/miniconda3 +stages: + - setup + - test + +# Job to set up the environment and install dependencies +setup_environment: + stage: setup + image: continuumio/miniconda3 script: - conda install -c conda-forge mamba # Install Mamba - mamba create -n snakemake -c conda-forge -c bioconda snakemake # Create environment and install Snakemake - echo "source activate snakemake" > ~/.bashrc # Ensure the environment is activated - source activate snakemake - snakemake --version - - cd GenomAsm4pg + artifacts: + paths: + - .conda/ + - .snakemake/ + +test_job: + stage: test + image: continuumio/miniconda3 + script: + - git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git - touch test_data.fasta.gz - - source activate snakemake - - echo "test over" \ No newline at end of file + - cd GenomAsm4pg + - source activate snakemake + dependencies: + - setup_environment \ No newline at end of file diff --git a/.snakemake/tmp.46vxigdb/snakejob.genometools_on_raw_data.13.sh b/.snakemake/tmp.46vxigdb/snakejob.genometools_on_raw_data.13.sh deleted file mode 100755 index 216e265..0000000 --- a/.snakemake/tmp.46vxigdb/snakejob.genometools_on_raw_data.13.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# properties = {"type": "single", "rule": "genometools_on_raw_data", "local": false, "input": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz"], "output": 
["/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/01_raw_data_QC/03_genometools/toy_dataset_bug.RawStat.txt"], "wildcards": {"runid": "toy_dataset_bug/toy_test_run", "id": "toy_dataset_bug"}, "params": {}, "log": [], "threads": 4, "resources": {"tmpdir": "/tmp"}, "jobid": 13, "cluster": {"job-name": "genometools_on_raw_data", "time": "96:00:00", "ntasks": 1, "cpus-per-task": 4, "mem": "60G", "nodes": 1, "ntasks-per-node": 1, "output": "slurm_logs/genometools_on_raw_data.%N.%j.out", "error": "slurm_logs/genometools_on_raw_data.%N.%j.err", "mail-type": "END,FAIL", "mail-user": "lucien.piat@inrae.fr"}} - cd /isilon/cbib/pangenoak_trials/GenomAsm4pg && \ -/module/apps/snakemake/5.8.1/bin/python \ --m snakemake /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/01_raw_data_QC/03_genometools/toy_dataset_bug.RawStat.txt --snakefile /isilon/cbib/pangenoak_trials/GenomAsm4pg/workflow/Snakefile \ ---force --cores all --keep-target-files --keep-remote --max-inventory-time 0 \ ---wait-for-files /isilon/cbib/pangenoak_trials/GenomAsm4pg/.snakemake/tmp.46vxigdb /mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz --latency-wait 60 \ - --attempt 1 --force-use-threads --scheduler greedy \ ---wrapper-prefix https://github.com/snakemake/snakemake-wrappers/raw/ \ - --allowed-rules genometools_on_raw_data --nocolor --notemp --no-hooks --nolock --scheduler-solver-path /module/apps/snakemake/5.8.1/bin \ ---mode 2 --use-singularity --singularity-args "-B /mnt/cbib/pangenoak_trials/GenomAsm4pg/" --default-resources "tmpdir=system_tmpdir" && exit 0 || exit 1 - diff --git a/.snakemake/tmp.46vxigdb/snakejob.hifiasm.4.sh b/.snakemake/tmp.46vxigdb/snakejob.hifiasm.4.sh deleted file mode 100755 index 11ce99c..0000000 --- a/.snakemake/tmp.46vxigdb/snakejob.hifiasm.4.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# properties = {"type": "single", "rule": "hifiasm", "local": false, "input": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz"], "output": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/00_assembly/toy_dataset_bug.bp.hap1.p_ctg.gfa", "/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/00_assembly/toy_dataset_bug.bp.hap2.p_ctg.gfa"], "wildcards": {"runid": "toy_dataset_bug/toy_test_run", "id": "toy_dataset_bug"}, "params": {"prefix": "/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/00_assembly/toy_dataset_bug"}, "log": [], "threads": 20, "resources": {"tmpdir": "/tmp", "mem_mb": 250000}, "jobid": 4, "cluster": {"job-name": "hifiasm", "time": "96:00:00", "ntasks": 1, "cpus-per-task": 20, "mem": "250G", "nodes": 1, "ntasks-per-node": 1, "output": "slurm_logs/hifiasm.%N.%j.out", "error": "slurm_logs/hifiasm.%N.%j.err", "mail-type": "END,FAIL", "mail-user": "lucien.piat@inrae.fr"}} - cd /isilon/cbib/pangenoak_trials/GenomAsm4pg && \ -/module/apps/snakemake/5.8.1/bin/python \ --m snakemake /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/00_assembly/toy_dataset_bug.bp.hap1.p_ctg.gfa --snakefile /isilon/cbib/pangenoak_trials/GenomAsm4pg/workflow/Snakefile \ ---force --cores all --keep-target-files --keep-remote --max-inventory-time 0 \ ---wait-for-files 
/isilon/cbib/pangenoak_trials/GenomAsm4pg/.snakemake/tmp.46vxigdb /mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz --latency-wait 60 \ - --attempt 1 --force-use-threads --scheduler greedy \ ---wrapper-prefix https://github.com/snakemake/snakemake-wrappers/raw/ \ - --allowed-rules hifiasm --nocolor --notemp --no-hooks --nolock --scheduler-solver-path /module/apps/snakemake/5.8.1/bin \ ---mode 2 --use-singularity --singularity-args "-B /mnt/cbib/pangenoak_trials/GenomAsm4pg/" --default-resources "tmpdir=system_tmpdir" && exit 0 || exit 1 - diff --git a/.snakemake/tmp.46vxigdb/snakejob.jellyfish.12.sh b/.snakemake/tmp.46vxigdb/snakejob.jellyfish.12.sh deleted file mode 100755 index e149b2d..0000000 --- a/.snakemake/tmp.46vxigdb/snakejob.jellyfish.12.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# properties = {"type": "single", "rule": "jellyfish", "local": false, "input": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz"], "output": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/01_raw_data_QC/04_kmer/toy_dataset_bug.jf", "/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/01_raw_data_QC/04_kmer/toy_dataset_bug.histo"], "wildcards": {"runid": "toy_dataset_bug/toy_test_run", "id": "toy_dataset_bug"}, "params": {}, "log": [], "threads": 4, "resources": {"tmpdir": "/tmp", "mem_mb": 40000}, "jobid": 12, "cluster": {"job-name": "jellyfish", "time": "96:00:00", "ntasks": 1, "cpus-per-task": 4, "mem": "60G", "nodes": 1, "ntasks-per-node": 1, "output": "slurm_logs/jellyfish.%N.%j.out", "error": "slurm_logs/jellyfish.%N.%j.err", "mail-type": "END,FAIL", "mail-user": "lucien.piat@inrae.fr"}} - cd /isilon/cbib/pangenoak_trials/GenomAsm4pg && \ -/module/apps/snakemake/5.8.1/bin/python \ --m snakemake /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/01_raw_data_QC/04_kmer/toy_dataset_bug.histo --snakefile /isilon/cbib/pangenoak_trials/GenomAsm4pg/workflow/Snakefile \ ---force --cores all --keep-target-files --keep-remote --max-inventory-time 0 \ ---wait-for-files /isilon/cbib/pangenoak_trials/GenomAsm4pg/.snakemake/tmp.46vxigdb /mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz --latency-wait 60 \ - --attempt 1 --force-use-threads --scheduler greedy \ ---wrapper-prefix https://github.com/snakemake/snakemake-wrappers/raw/ \ - --allowed-rules jellyfish --nocolor --notemp --no-hooks --nolock --scheduler-solver-path /module/apps/snakemake/5.8.1/bin \ ---mode 2 --use-singularity --singularity-args "-B /mnt/cbib/pangenoak_trials/GenomAsm4pg/" --default-resources "tmpdir=system_tmpdir" && exit 0 || exit 1 - diff --git a/.snakemake/tmp.46vxigdb/snakejob.meryl.25.sh b/.snakemake/tmp.46vxigdb/snakejob.meryl.25.sh deleted file mode 100755 index 718e4bd..0000000 --- a/.snakemake/tmp.46vxigdb/snakejob.meryl.25.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# properties = {"type": "single", "rule": "meryl", "local": false, "input": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz"], "output": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/toy_dataset_bug_reads-db_k21.meryl"], "wildcards": {"runid": "toy_dataset_bug/toy_test_run", "id": "toy_dataset_bug"}, "params": {}, "log": [], "threads": 20, "resources": {"tmpdir": "/tmp", "mem_mb": 60000}, "jobid": 25, "cluster": {"job-name": "meryl", "time": "96:00:00", "ntasks": 1, 
"cpus-per-task": 10, "mem": "60G", "nodes": 1, "ntasks-per-node": 1, "output": "slurm_logs/meryl.%N.%j.out", "error": "slurm_logs/meryl.%N.%j.err", "mail-type": "END,FAIL", "mail-user": "lucien.piat@inrae.fr"}} - cd /isilon/cbib/pangenoak_trials/GenomAsm4pg && \ -/module/apps/snakemake/5.8.1/bin/python \ --m snakemake /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/toy_dataset_bug_reads-db_k21.meryl --snakefile /isilon/cbib/pangenoak_trials/GenomAsm4pg/workflow/Snakefile \ ---force --cores all --keep-target-files --keep-remote --max-inventory-time 0 \ ---wait-for-files /isilon/cbib/pangenoak_trials/GenomAsm4pg/.snakemake/tmp.46vxigdb /mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz --latency-wait 60 \ - --attempt 1 --force-use-threads --scheduler greedy \ ---wrapper-prefix https://github.com/snakemake/snakemake-wrappers/raw/ \ - --allowed-rules meryl --nocolor --notemp --no-hooks --nolock --scheduler-solver-path /module/apps/snakemake/5.8.1/bin \ ---mode 2 --use-singularity --singularity-args "-B /mnt/cbib/pangenoak_trials/GenomAsm4pg/" --default-resources "tmpdir=system_tmpdir" && exit 0 || exit 1 - diff --git a/.snakemake/tmp.46vxigdb/snakejob.start_time.39.sh b/.snakemake/tmp.46vxigdb/snakejob.start_time.39.sh deleted file mode 100755 index 2668563..0000000 --- a/.snakemake/tmp.46vxigdb/snakejob.start_time.39.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# properties = {"type": "single", "rule": "start_time", "local": false, "input": [], "output": ["/mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/runtime.txt"], "wildcards": {"runid": "toy_dataset_bug/toy_test_run"}, "params": {}, "log": [], "threads": 1, "resources": {"tmpdir": "/tmp"}, "jobid": 39, "cluster": {"job-name": "start_time", "time": "96:00:00", "ntasks": 1, "cpus-per-task": 4, "mem": "60G", "nodes": 1, "ntasks-per-node": 1, "output": "slurm_logs/start_time.%N.%j.out", "error": "slurm_logs/start_time.%N.%j.err", "mail-type": "END,FAIL", "mail-user": "lucien.piat@inrae.fr"}} - cd /isilon/cbib/pangenoak_trials/GenomAsm4pg && \ -/module/apps/snakemake/5.8.1/bin/python \ --m snakemake /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output/results/toy_dataset_bug/toy_test_run/runtime.txt --snakefile /isilon/cbib/pangenoak_trials/GenomAsm4pg/workflow/Snakefile \ ---force --cores all --keep-target-files --keep-remote --max-inventory-time 0 \ ---wait-for-files /isilon/cbib/pangenoak_trials/GenomAsm4pg/.snakemake/tmp.46vxigdb --latency-wait 60 \ - --attempt 1 --force-use-threads --scheduler greedy \ ---wrapper-prefix https://github.com/snakemake/snakemake-wrappers/raw/ \ - --allowed-rules start_time --nocolor --notemp --no-hooks --nolock --scheduler-solver-path /module/apps/snakemake/5.8.1/bin \ ---mode 2 --use-singularity --singularity-args "-B /mnt/cbib/pangenoak_trials/GenomAsm4pg/" --default-resources "tmpdir=system_tmpdir" && exit 0 || exit 1 - diff --git a/workflow/doc/Quick-start.md b/workflow/doc/Quick-start.md index 8e55591..4db0da8 100644 --- a/workflow/doc/Quick-start.md +++ b/workflow/doc/Quick-start.md @@ -1,35 +1,38 @@ # Quick start + This tutorial shows how to use the workflow with default assembly mode which takes PacBio Hifi data as input. [TOC] ## Clone repository ```bash +cd . git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git ``` -Clone the repository in your desired folder. -## 1. Set up the scripts -### In job.sh + +## 1. 
+## 1. Cluster profile setup
 ```bash
-cd GenomAsm4pg/
-vim job.sh
+cd GenomAsm4pg/.config/snakemake_profile
 ```
-Modify:
-- Line 17: Set your email address.
-- Line 53: Set the path to your dataset folder.
-### In masterconfig.yaml
+The current profile is made for SLURM. If you use it, change line 13 to your email address in the `cluster_config.yml` file.
+
+To run this workflow on another HPC, create another profile (https://github.com/Snakemake-Profiles) and add it in the `.config/snakemake_profile` directory. Change the `CLUSTER_CONFIG` and `PROFILE` variables in `job.sh` and `prejob.sh` scripts.
+
+
+## 2. Config file
+**TO-DO : add a toy fasta.**
 ```bash
-vim .config/masterconfig.yaml
+cd ..
 ```
-Modify
-- Line 2: Set the path to your output folder.
+
+Modify `masterconfig.yaml`. Root refers to the path for the output data.
 ```yaml
 # absolute path to your desired output path
-root: ./GenomAsm4pg/<your_output_folder>
+root: ./GenomAsm4pg/tutorial_output
 ```
-Modify
-- Line 18: Add all your raw datasets in IDS.
-- Line 20: Provide the parameters for all datasets.
+
+The reads file is `toy_dataset.fasta`; its name is used as the key in the config.
+
 ```yaml
 ####################### job - workflow #######################
 ### CONFIG
@@ -39,31 +42,36 @@ toy_dataset:
     fasta: "./GenomAsm4pg/tutorial_data/toy_dataset.fasta"
     run: tutorial
     ploidy: 2
-    busco_lineage: eudicots_odb10
+    busco_lineage: eudicots_odb10 
     mode: default
 ```
+
-## 2. Adapt the scripts to your HPC
+## 3. Create slurm_logs directory
 ```bash
-vim .config/snakemake_profile/slurm/cluster_config.yml
+cd ..
+mkdir slurm_logs
 ```
-The current profile is configured for SLURM. If you use SLURM, change line 13 to your email address.
+SLURM logs for each rule will be in this directory; there are .out and .err files for the workflow (*snakemake.cortex**) and for each rule (*rulename.cortex**).
+
+## 4. Mail setup
+Modify line 17 to your email address in `job.sh`.
-To run this workflow on another HPC, create a new profile (https://github.com/Snakemake-Profiles) and add it to the .config/snakemake_profile directory. Update the CLUSTER_CONFIG and PROFILE variables in the job.sh and prejob.sh scripts.
+## 5. Dry run
+To check the config, first do a dry run of the workflow.
-If your cluster doesn't have Singularity enabled by default, add it to the list of modules to load in job.sh.
-## 3. Dry run
-To check the configuration, first perform a dry run of the workflow:
 ```bash
 sbatch job.sh dry
 ```
-You can consult the logs in the slurm_logs/ directory.
-## 4. Run
-If the dry run is successful, ensure that the SNG_BIND variable in job.sh matches the root variable in masterconfig.yaml.
-Then, run the script:
+## 6. Run
+If the dry run is successful, check that the `SNG_BIND` variable in `job.sh` is the same as the `root` variable in `masterconfig.yaml`.
+
+If Singularity is not in the HPC environment, add `module load singularity` under `module load snakemake/6.5.1`.
+
+You can run the workflow.
+
 ```bash
 sbatch job.sh
 ```
+
 ## Other assembly modes
 If you want to use additional Hi-C data or parental data, follow the [Hi-C assembly mode tutorial](Assembly-Mode/Hi-C-tutorial.md) or the [Trio assembly mode tutorial](Assembly-Mode/Trio-tutorial.md). To go further with the workflow, go [here](Going-further.md).
-
-**TO-DO : add a toy fasta.**
\ No newline at end of file
--
GitLab
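An editorial aside on step 6 of the Quick-start page above: the `SNG_BIND`/`root` agreement is easy to verify before submitting. A minimal sketch, assuming the tutorial's default file layout at the repository top level:

```bash
# Print the Singularity bind path and the workflow output root side by side;
# the bind path must cover the root, or containerised rules cannot see their files.
grep -m1 '^SNG_BIND=' job.sh
grep -m1 '^root:' .config/masterconfig.yaml
```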
From f444f44fa4b9f392e1a9e44eb35c9914b10de27b Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 18 Sep 2024 11:29:43 +0200
Subject: [PATCH 018/178] update so it includes all the small QOL changes made
 in the scripts

---
 workflow/doc/Quick-start.md | 68 ++++++++++++++++---------------------
 1 file changed, 30 insertions(+), 38 deletions(-)

diff --git a/workflow/doc/Quick-start.md b/workflow/doc/Quick-start.md
index 4db0da8..8e55591 100644
--- a/workflow/doc/Quick-start.md
+++ b/workflow/doc/Quick-start.md
@@ -1,38 +1,35 @@
 # Quick start
-
 This tutorial shows how to use the workflow with default assembly mode which takes PacBio Hifi data as input.
 
 [TOC]
 
 ## Clone repository
 ```bash
-cd .
 git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git
 ```
+Clone the repository in your desired folder.
+## 1. Set up the scripts
+### In job.sh
 ```bash
-cd GenomAsm4pg/.config/snakemake_profile
+cd GenomAsm4pg/
+vim job.sh
 ```
+Modify:
+- Line 17: Set your email address.
+- Line 53: Set the path to your dataset folder.
-The current profile is made for SLURM. If you use it, change line 13 to your email address in the `cluster_config.yml` file.
-
-To run this workflow on another HPC, create another profile (https://github.com/Snakemake-Profiles) and add it in the `.config/snakemake_profile` directory. Change the `CLUSTER_CONFIG` and `PROFILE` variables in `job.sh` and `prejob.sh` scripts.
-
-
-## 2. Config file
-**TO-DO : add a toy fasta.**
+### In masterconfig.yaml
 ```bash
-cd ..
+vim .config/masterconfig.yaml
 ```
-
-Modify `masterconfig.yaml`. Root refers to the path for the output data.
+Modify
+- Line 2: Set the path to your output folder.
 ```yaml
 # absolute path to your desired output path
-root: ./GenomAsm4pg/tutorial_output
+root: ./GenomAsm4pg/<your_output_folder>
 ```
-
-The reads file is `toy_dataset.fasta`; its name is used as the key in the config.
-
+Modify
+- Line 18: Add all your raw datasets in IDS.
+- Line 20: Provide the parameters for all datasets.
 ```yaml
 ####################### job - workflow #######################
 ### CONFIG
@@ -42,36 +39,31 @@ toy_dataset:
     fasta: "./GenomAsm4pg/tutorial_data/toy_dataset.fasta"
     run: tutorial
     ploidy: 2
-    busco_lineage: eudicots_odb10 
+    busco_lineage: eudicots_odb10
     mode: default
 ```
-
-## 3. Create slurm_logs directory
+## 2. Adapt the scripts to your HPC
 ```bash
-cd ..
-mkdir slurm_logs
+vim .config/snakemake_profile/slurm/cluster_config.yml
 ```
-SLURM logs for each rule will be in this directory; there are .out and .err files for the workflow (*snakemake.cortex**) and for each rule (*rulename.cortex**).
-
-## 4. Mail setup
-Modify line 17 to your email address in `job.sh`.
+The current profile is configured for SLURM. If you use SLURM, change line 13 to your email address.
-## 5. Dry run
-To check the config, first do a dry run of the workflow.
+To run this workflow on another HPC, create a new profile (https://github.com/Snakemake-Profiles) and add it to the .config/snakemake_profile directory. Update the CLUSTER_CONFIG and PROFILE variables in the job.sh and prejob.sh scripts.
+If your cluster doesn't have Singularity enabled by default, add it to the list of modules to load in job.sh.
+## 3. Dry run
+To check the configuration, first perform a dry run of the workflow:
 ```bash
 sbatch job.sh dry
 ```
+You can consult the logs in the slurm_logs/ directory.
-## 6. Run
-If the dry run is successful, check that the `SNG_BIND` variable in `job.sh` is the same as the `root` variable in `masterconfig.yaml`.
-
-If Singularity is not in the HPC environment, add `module load singularity` under `module load snakemake/6.5.1`.
-
-You can run the workflow.
-
+## 4. Run
+If the dry run is successful, ensure that the SNG_BIND variable in job.sh matches the root variable in masterconfig.yaml.
+Then, run the script:
 ```bash
 sbatch job.sh
 ```
-
 ## Other assembly modes
 If you want to use additional Hi-C data or parental data, follow the [Hi-C assembly mode tutorial](Assembly-Mode/Hi-C-tutorial.md) or the [Trio assembly mode tutorial](Assembly-Mode/Trio-tutorial.md). To go further with the workflow, go [here](Going-further.md).
+
+**TO-DO : add a toy fasta.**
\ No newline at end of file
--
GitLab

From 8089688fb4e4e4d66eceb727f8020899f9294ca4 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 20 Sep 2024 11:30:23 +0200
Subject: [PATCH 019/178] Update hifiasm to v0.19.6

---
 workflow/rules/02_asm.smk | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflow/rules/02_asm.smk b/workflow/rules/02_asm.smk
index ee20083..c86b923 100644
--- a/workflow/rules/02_asm.smk
+++ b/workflow/rules/02_asm.smk
@@ -15,7 +15,7 @@ rule hifiasm:
     resources:
         mem_mb=250000
     container:
-        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm0.16.1"
+        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm:0.19.6"
     shell:
         "hifiasm -l3 -o {params.prefix} -t {threads} {input}"
 
@@ -38,7 +38,7 @@ rule hifiasm_hic:
     resources:
         mem_mb=250000
     container:
-        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm0.16.1"
+        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm:0.19.6"
     shell:
         "hifiasm -l3 -o {params.prefix} -t {threads} --h1 {input.r1} --h2 {input.r2} {input.hifi}"
 
@@ -75,7 +75,7 @@ rule hifiasm_trio:
     resources:
         mem_mb=250000
     container:
-        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm0.16.1"
+        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm:0.19.6"
     shell:
         "hifiasm -o {params.prefix} -t {threads} -1 {input.p1} -2 {input.p2} {input.child}"
 
--
GitLab

From fe9edf0151ec7325dbc110495aef2773bbe51e8f Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 20 Sep 2024 11:31:22 +0200
Subject: [PATCH 020/178] update fastqc to v0.12.1

---
 .config/masterconfig.yaml    | 16 ++++++++++++----
 .gitlab-ci.yml               | 28 ++++++----------------------
 workflow/rules/01_qc.smk     |  4 ++--
 workflow/rules/03_asm_qc.smk |  2 +-
 4 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml
index 6f0f25a..82690e8 100644
--- a/.config/masterconfig.yaml
+++ b/.config/masterconfig.yaml
@@ -1,9 +1,9 @@
 # absolute path to your desired output path
-root: .
+root: /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output
 
 ####################### optional prejob - data preparation #######################
 # path to tar data
-data: /path
+data: /mnt/cbib/pangenoak_trials/GenomAsm4pg/
 # list of tar names
 get_all_tar_filename: False
 tarIDS: "tar_filename"
 
 ####################### job - workflow #######################
 # number of threads used by pigz
 pigz_threads: 4
+# k-mers size (reduce for small datasets)
+km_size: 21 #TODO
 
 ### CONFIG
+IDS: ["toy_dataset_bug"]
 
-
+toy_dataset_bug:
+    fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz"
+    run: toy_test_run
+    ploidy: 2
+    busco_lineage: arthropoda_odb10
+    mode: default
 
 ####################### workflow output directories #######################
 # results directory
-resdir: workflow_results
+resdir: results
 
 ### PREJOB
 # extracted input data
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 45c0476..806be8b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,29 +1,13 @@
-stages:
-  - setup
-  - test
-
-# Job to set up the environment and install dependencies
-setup_environment:
-  stage: setup
-  image: continuumio/miniconda3
+test_job:
+  stage: test
+  image: continuumio/miniconda3
   script:
     - conda install -c conda-forge mamba # Install Mamba
     - mamba create -n snakemake -c conda-forge -c bioconda snakemake # Create environment and install Snakemake
     - echo "source activate snakemake" > ~/.bashrc # Ensure the environment is activated
     - source activate snakemake
     - snakemake --version
-  artifacts:
-    paths:
-      - .conda/
-      - .snakemake/
-
-test_job:
-  stage: test
-  image: continuumio/miniconda3
-  script:
-    - git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git
-    - touch test_data.fasta.gz
     - cd GenomAsm4pg
-    - source activate snakemake
-  dependencies:
-    - setup_environment
\ No newline at end of file
+    - touch test_data.fasta.gz
+    - source activate snakemake
+    - echo "test over"
\ No newline at end of file
diff --git a/workflow/rules/01_qc.smk b/workflow/rules/01_qc.smk
index bf963a2..4f7e0b2 100644
--- a/workflow/rules/01_qc.smk
+++ b/workflow/rules/01_qc.smk
@@ -28,11 +28,11 @@ rule fastqc:
     priority: 1
     threads: 4
     container:
-        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/fastqc0.11.5"
+        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/fastqc:0.12.1"
     shell:
         "fastqc -o {params.output_path} {input}"
 
-    ### read stats
+### read stats
 
 rule genometools_on_raw_data:
     input:
diff --git a/workflow/rules/03_asm_qc.smk b/workflow/rules/03_asm_qc.smk
index 8c37ac9..0cc03e2 100644
--- a/workflow/rules/03_asm_qc.smk
+++ b/workflow/rules/03_asm_qc.smk
@@ -33,7 +33,7 @@ rule busco:
     resources:
         mem_mb=100000
     container:
-        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/busco5.3.1"
+        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/busco:5.7.1"
     shell:
         "busco -f -i {input[0]} -l {params.lineage} --out_path {params.prefix} -o {params.sample} -m genome -c {threads}"
 
--
GitLab

From 6d8ccae79817fd8f0bb6cc3eee906bf098a05654 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 20 Sep 2024 11:46:35 +0200
Subject: [PATCH 021/178] update busco to v5.7.1

---
 workflow/rules/03_asm_qc.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/03_asm_qc.smk b/workflow/rules/03_asm_qc.smk
index 0cc03e2..278317e 100644
--- a/workflow/rules/03_asm_qc.smk
+++ b/workflow/rules/03_asm_qc.smk
@@ -17,7 +17,7 @@ use rule genometools_on_raw_data as genometools_on_assembly with:
    output:
"/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/assembly_stats/{id}_hap{n}.AStats.txt" -### BUSCO stats on assembly +### BUSCO stats on assembly (may not work on first run, rerun the WF) rule busco: input: rules.unzip_hap_fasta.output -- GitLab From bc72c3c3715fdcf162184a8658cc5dc42812aceb Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 23 Sep 2024 17:53:06 +0200 Subject: [PATCH 022/178] untangle the yak image --- workflow/rules/02_asm.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/02_asm.smk b/workflow/rules/02_asm.smk index c86b923..626a927 100644 --- a/workflow/rules/02_asm.smk +++ b/workflow/rules/02_asm.smk @@ -53,7 +53,7 @@ rule yak: benchmark: abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_yak_benchmark.txt" container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm0.16.1" + "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/yak:0.1" shell: "yak count -k31 -b37 -t16 -o {output.p1} {input.p1} && " "yak count -k31 -b37 -t16 -o {output.p2} {input.p2}" -- GitLab From 03ce93af1557e77790816e7c9c6357ddde829971 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 23 Sep 2024 17:55:21 +0200 Subject: [PATCH 023/178] clear the CICD test --- .gitlab-ci.yml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 806be8b..2166f91 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,12 +2,4 @@ test_job: stage: test image: continuumio/miniconda3 script: - - conda install -c conda-forge mamba # Install Mamba - - mamba create -n snakemake -c conda-forge -c bioconda snakemake # Create environment and install Snakemake - - echo "source activate snakemake" > ~/.bashrc # Ensure the environment is activated - - source activate snakemake - - snakemake --version - - cd GenomAsm4pg - - touch test_data.fasta.gz - - source activate snakemake - - echo "test over" \ No newline at end of file + - echo "Temporarily clear this file to reduce computational load on every push." -- GitLab From cd998a214fa3f4e17100167e24fa9a8d4606c676 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 25 Sep 2024 11:52:51 +0200 Subject: [PATCH 024/178] add option to output dag --- job.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/job.sh b/job.sh index 8438271..0fb7b32 100644 --- a/job.sh +++ b/job.sh @@ -61,11 +61,16 @@ if [ "$1" = "dry" ] then # dry run snakemake --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG -n -r +elif [ "$1" = "dag" ] +then + # generate DAG + snakemake --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG --dag > dag.dot + echo "DAG has been generated as dag.png" elif [ -z "$1" ] then # run snakemake --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG else - echo "Error: Invalid argument. Use 'dry' or no argument." >&2 + echo "Error: Invalid argument. Use 'dry', 'dag', or no argument." 
From 8edd68b648ed4a808098ccc0a12cf31fc2527360 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 25 Sep 2024 11:57:07 +0200
Subject: [PATCH 025/178] add values for LTR_retriever

---
 .config/snakemake_profile/slurm/cluster_config.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.config/snakemake_profile/slurm/cluster_config.yml b/.config/snakemake_profile/slurm/cluster_config.yml
index db27b01..fea0e13 100644
--- a/.config/snakemake_profile/slurm/cluster_config.yml
+++ b/.config/snakemake_profile/slurm/cluster_config.yml
@@ -17,6 +17,10 @@ __default__:
 convert_to_fasta:
     cpus-per-task: 10
 
+# LTR_retriever
+LTR_retriever:
+    cpus-per-task: 10
+
 # BUSCO
 busco:
     mem: "100G"
--
GitLab

From 2212842f79779485272348b8e1b5e768c69e04b3 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 25 Sep 2024 11:58:57 +0200
Subject: [PATCH 026/178] include LAI to the all rule

---
 workflow/Snakefile | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6bc7660..951f834 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -68,7 +68,6 @@ REP_TRIO_ID = for_report_trio(IDS)
 RUNID_TRIO = run_id(REP_TRIO_ID)
 BUSCO_LIN_TRIO = busco_lin(REP_TRIO_ID)
 
-
 report_trio_output = expand(res_path + "/{runid}/report_trio_{id}.{lin}.html", zip,
        runid=RUNID_TRIO, id=REP_TRIO_ID, lin = BUSCO_LIN_TRIO)
 
@@ -102,6 +101,12 @@ time = expand(res_path + "/{runid}/runtime.{id}.{lin}.txt", zip,
 time_trio = expand(res_path + "/{runid}/runtime_trio.{id}.{lin}.txt", zip,
        runid = RUNID_TRIO, id=REP_TRIO_ID, lin=BUSCO_LIN_TRIO)
 
+# LAI output
+lai = expand(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/{id}_hap{n}.out.LAI",
+        runid=RUNID_REG, id=REP_ID, n=["1", "2"])
+lai_purge = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/LAI/purge_{id}_hap{n}.out.LAI",
+        runid=RUNID_REG, id=REP_ID, n=["1", "2"])
+
 rule_all_input_list = [
    longqc_output,
    fastqc_output,
@@ -116,7 +121,9 @@ rule_all_input_list = [
    busco_trio,
    busco_purged_trio,
    time,
-    time_trio
+    time_trio,
+    lai,
+    lai_purge
 ]
 
 #### target files
--
GitLab

From 7a560def23b8f1405358a4b1f4e20e1b33b8d2b5 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 25 Sep 2024 12:00:17 +0200
Subject: [PATCH 027/178] add LTR Assembly Index to the report

---
 workflow/scripts/report.Rmd | 41 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/report.Rmd b/workflow/scripts/report.Rmd
index 4fef19e..7a9fb5f 100644
--- a/workflow/scripts/report.Rmd
+++ b/workflow/scripts/report.Rmd
@@ -85,6 +85,25 @@ cat(readLines(snakemake@input[["tel_1"]]), sep = '\n')
 cat(readLines(snakemake@input[["tel_2"]]), sep = '\n')
 ```
 
+### LTR Assembly Index (LAI)
+#### Hap 1
+LTR recap
+```{r comment='', echo=FALSE}
+cat(head(readLines(snakemake@input[["LRT_recap_1"]]), 50), sep = '\n')
+```
+LAI
+```{r comment='', echo=FALSE}
+cat(head(readLines(snakemake@input[["LAI_1"]]), 2), sep = '\n')
+```
+#### Hap 2
+LTR recap
+```{r comment='', echo=FALSE}
+cat(head(readLines(snakemake@input[["LRT_recap_2"]]), 50), sep = '\n')
+```
+LAI
+```{r comment='', echo=FALSE}
+cat(head(readLines(snakemake@input[["LAI_2"]]), 2), sep = '\n')
+```
 
 ## Assembly QC - After Purge_dups
 ### Assembly statistics
@@ -134,4 +153,24 @@ cat(readLines(snakemake@input[["P_tel_1"]]), sep = '\n')
 #### Hap 2
 ```{r comment='', echo=FALSE}
 cat(readLines(snakemake@input[["P_tel_2"]]), sep = '\n')
-```
\ No newline at end of file
+```
+
+### LTR Assembly Index (LAI)
+#### Hap 1
+LTR recap
+```{r comment='', echo=FALSE}
+cat(head(readLines(snakemake@input[["P_LRT_recap_1"]]), 50), sep = '\n')
+```
+LAI
+```{r comment='', echo=FALSE}
+cat(head(readLines(snakemake@input[["P_LAI_1"]]), 2), sep = '\n')
+```
+#### Hap 2
+LTR recap
+```{r comment='', echo=FALSE}
+cat(head(readLines(snakemake@input[["P_LRT_recap_2"]]), 50), sep = '\n')
+```
+LAI
+```{r comment='', echo=FALSE}
+cat(head(readLines(snakemake@input[["P_LAI_2"]]), 2), sep = '\n')
+```
--
GitLab
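Background for the LAI sections wired into the report above: the LTR Assembly Index scores assembly continuity as the proportion of LTR retrotransposon sequence that is structurally intact, scaled to 0-100, so higher means fewer broken repeats. The rules added in the next patch wrap the same two tools that the sketch below calls directly; the FASTA name is illustrative and the flags mirror the workflow's own invocations:

```bash
# Collect candidate LTR elements, then filter them and compute the
# assembly-wide LAI score.
ltr_finder -C assembly_hap1.fa > assembly_hap1.scn
LTR_retriever -threads 10 -genome assembly_hap1.fa -infinder assembly_hap1.scn
# LAI lands in assembly_hap1.fa.out.LAI; the summary table in assembly_hap1.fa.tbl
```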
From c6d668656af3066a6f66ca7eabfcbb905b8317d1 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 25 Sep 2024 12:02:31 +0200
Subject: [PATCH 028/178] add rules for LTR Assembly Index

---
 workflow/rules/03_asm_qc.smk        | 35 ++++++++++++++++++++++++++++-
 workflow/rules/05_purged_asm_qc.smk | 18 ++++++++++++++-
 workflow/rules/07_report.smk        |  8 +++++++
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/03_asm_qc.smk b/workflow/rules/03_asm_qc.smk
index 278317e..ca712f9 100644
--- a/workflow/rules/03_asm_qc.smk
+++ b/workflow/rules/03_asm_qc.smk
@@ -66,4 +66,37 @@ rule find_telomeres:
    container:
        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/biopython1.75"
    shell:
-        "python3 workflow/scripts/FindTelomeres.py {input} > {output}"
\ No newline at end of file
+        "python3 workflow/scripts/FindTelomeres.py {input} > {output}"
+
+rule LTR_finder:
+    input:
+        rules.unzip_hap_fasta.output
+    output:
+        res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/{id}_hap{n}.scn"
+    singularity:
+        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/ltr_finder:latest"
+    shell:
+        "ltr_finder -C {input} > {output}"
+
+rule LTR_retriever:
+    input:
+        scn=rules.LTR_finder.output,
+        genome=rules.unzip_hap_fasta.output
+    output:
+        lai=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/{id}_hap{n}.out.LAI",
+        recap=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/recap_{id}_hap{n}.tbl"
+    params:
+        prefix="{id}_hap{n}"
+    threads: 10
+    singularity:
+        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/ltr_retriever:3.0.1"
+    shell:
+        'export PATH="/opt/LTR_retriever:$PATH" && '
+        'LTR_retriever -threads {threads} -genome {input.genome} -infinder {input.scn} && '
+        'mv {params.prefix}.fa.out.LAI {output.lai} && '
+        'mv {params.prefix}.fa.tbl {output.recap} && '
+        'rm {params.prefix}.fa?* && '
+        'rm -rf .RepeatMaskerCache &&'
+        'rm {params.prefix}.fa'
+
+    
\ No newline at end of file
diff --git a/workflow/rules/05_purged_asm_qc.smk b/workflow/rules/05_purged_asm_qc.smk
index 708f538..945c382 100644
--- a/workflow/rules/05_purged_asm_qc.smk
+++ b/workflow/rules/05_purged_asm_qc.smk
@@ -42,4 +42,20 @@ rule purge_find_telomeres:
    container:
        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/biopython1.75"
    shell:
-        "python3 workflow/scripts/FindTelomeres.py {input} > {output}"
\ No newline at end of file
+        "python3 workflow/scripts/FindTelomeres.py {input} > {output}"
+
+use rule LTR_finder as purge_LTR_finder with :
+    input:
+        rules.purge_dups.output.purge
+    output:
+        res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/LAI/purge_{id}_hap{n}.scn"
+
+use rule LTR_retriever as purge_LTR_retriever with :
+    input:
+        scn=rules.purge_LTR_finder.output,
+        genome=rules.purge_dups.output.purge
+    output:
+        lai=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/LAI/purge_{id}_hap{n}.out.LAI",
+        recap=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/LAI/purge_recap_{id}_hap{n}.tbl"
+    params:
+        prefix="{id}_hap{n}.purged"
\ No newline at end of file
diff --git a/workflow/rules/07_report.smk b/workflow/rules/07_report.smk
index e84bc94..644f72a 100644
--- a/workflow/rules/07_report.smk
+++ b/workflow/rules/07_report.smk
@@ -19,6 +19,10 @@ rule report:
        kplot_2 = ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png",
        tel_1 = ASM_QC + "/telomeres/{id}_hap1_telomeres.txt",
        tel_2 = ASM_QC + "/telomeres/{id}_hap2_telomeres.txt",
+        LRT_recap_1 = ASM_QC + "/LAI/recap_{id}_hap1.tbl",
+        LAI_1 = ASM_QC + "/LAI/{id}_hap1.out.LAI",
+        LRT_recap_2 = ASM_QC + "/LAI/recap_{id}_hap2.tbl",
+        LAI_2 = ASM_QC + "/LAI/{id}_hap2.out.LAI",
        merq_comp = rules.merqury.output.stat,
        merq_err = rules.merqury.output.qv,
        # after purge_dups assembly QC
@@ -30,6 +34,10 @@ rule report:
        P_kplot_2 = P_ASM_QC + "/katplot/hap2/{id}_purged_hap2.katplot.png",
        P_tel_1 = P_ASM_QC + "/telomeres/{id}_hap1_purged_telomeres.txt",
        P_tel_2 = P_ASM_QC + "/telomeres/{id}_hap2_purged_telomeres.txt",
+        P_LRT_recap_1 = P_ASM_QC + "/LAI/purge_recap_{id}_hap1.tbl",
+        P_LAI_1 = P_ASM_QC + "/LAI/purge_{id}_hap1.out.LAI",
+        P_LRT_recap_2 = P_ASM_QC + "/LAI/purge_recap_{id}_hap2.tbl",
+        P_LAI_2 = P_ASM_QC + "/LAI/purge_{id}_hap2.out.LAI",
        P_merq_comp = rules.purge_merqury.output.stat,
        P_merq_err = rules.purge_merqury.output.qv
    output:
--
GitLab

From 271d8e0970f281e61faa1bd6d64d6bc5f9d9acd7 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 15 Oct 2024 12:01:43 +0200
Subject: [PATCH 029/178] add options for non purged datasets

---
 .config/masterconfig.yaml | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml
index 82690e8..c3c6030 100644
--- a/.config/masterconfig.yaml
+++ b/.config/masterconfig.yaml
@@ -1,27 +1,38 @@
 # absolute path to your desired output path
-root: /mnt/cbib/pangenoak_trials/GenomAsm4pg/tutorial_output
+root: /mnt/cbib/pangenoak_trials/GenomAsm4pg
 
 ####################### optional prejob - data preparation #######################
 # path to tar data
-data: /mnt/cbib/pangenoak_trials/GenomAsm4pg/
+data: /mnt/cbib/pangenoak_trials/GenomAsm4pg
 # list of tar names
 get_all_tar_filename: False
-tarIDS: "tar_filename"
+tarIDS: "<tar_filenames>"
 
 ####################### job - workflow #######################
 # number of threads used by pigz
 pigz_threads: 4
 # k-mers size (reduce for small datasets)
-km_size: 21 #TODO
+km_size: 21 #TODO, would be nice if that was modulable
 
 ### CONFIG
-IDS: ["toy_dataset_bug"]
+IDS: ["purge", "no_purge"]
 
-toy_dataset_bug:
-    fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/lady_bug_data.fasta.gz"
-    run: toy_test_run
+purge:
+    run: purge_test
+    fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/yeast_data.fasta.gz"
     ploidy: 2
-    busco_lineage: arthropoda_odb10
+    busco_lineage: saccharomycetes_odb10
+    assembly_purge_force: 0 #[0-3]
+    purge_dups : False
+    mode: default
+
+no_purge:
+    run: purge_test
+    fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/yeast_data.fasta.gz"
+    ploidy: 2
+    busco_lineage: saccharomycetes_odb10
+    assembly_purge_force: 3 #[0-3]
+    purge_dups : True
     mode: default
 
 ####################### workflow output directories #######################
--
GitLab
From bd25f9e9f6da928ccba94c60bda4e530833c855f Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 15 Oct 2024 13:36:50 +0200
Subject: [PATCH 030/178] add support for no purge WFs

---
 workflow/rules/00_runtime.smk | 40 +++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/workflow/rules/00_runtime.smk b/workflow/rules/00_runtime.smk
index 2c408ba..b7c7b56 100644
--- a/workflow/rules/00_runtime.smk
+++ b/workflow/rules/00_runtime.smk
@@ -12,6 +12,26 @@ rule elasped_time:
    input:
        rules.start_time.output,
        rules.rename_report.output
+    output:
+        res_path + "/{runid}/p_runtime.{id}.{lin}.txt"
+    run:
+        import time
+        from datetime import timedelta
+
+        with open(input[0], "r") as inp:
+            start = inp.read()
+
+        end = time.time()
+        elapsed_time = end - float(start)
+        td = timedelta(seconds=elapsed_time)
+
+        with open(output[0], "w") as out:
+            out.write("Runtime (hh:mm:ss): " + str(td))
+
+rule elasped_time_no_purge:
+    input:
+        rules.start_time.output,
+        rules.rename_no_purge_report.output
    output:
        res_path + "/{runid}/runtime.{id}.{lin}.txt"
    run:
        import time
        from datetime import timedelta

        with open(input[0], "r") as inp:
            start = inp.read()

        end = time.time()
        elapsed_time = end - float(start)
        td = timedelta(seconds=elapsed_time)

        with open(output[0], "w") as out:
            out.write("Runtime (hh:mm:ss): " + str(td))
@@ -33,6 +53,26 @@ rule elasped_time_trio:
    input:
        rules.start_time.output,
        rules.rename_report_trio.output
+    output:
+        res_path + "/{runid}/p_runtime_trio.{id}.{lin}.txt"
+    run:
+        import time
+        from datetime import timedelta
+
+        with open(input[0], "r") as inp:
+            start = inp.read()
+
+        end = time.time()
+        elapsed_time = end - float(start)
+        td = timedelta(seconds=elapsed_time)
+
+        with open(output[0], "w") as out:
+            out.write("Runtime (hh:mm:ss): " + str(td))
+
+rule elasped_time_trio_no_purge:
+    input:
+        rules.start_time.output,
+        rules.no_purge_rename_report_trio.output
    output:
        res_path + "/{runid}/runtime_trio.{id}.{lin}.txt"
    run:
--
GitLab

From 40d0c802c4acd7b8e99c59f50fed918e156a6cff Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 15 Oct 2024 13:44:46 +0200
Subject: [PATCH 031/178] add function to retrieve the purge parameters

---
 workflow/scripts/from_config/parameters.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/from_config/parameters.py b/workflow/scripts/from_config/parameters.py
index a68438d..71c54f2 100644
--- a/workflow/scripts/from_config/parameters.py
+++ b/workflow/scripts/from_config/parameters.py
@@ -35,4 +35,16 @@ def get_fastq(wildcards):
 def get_bam(wildcards):
    id_name = wildcards.Bid
    fq = config[f'{id_name}']["bam"]
-    return(fq)
\ No newline at end of file
+    return(fq)
+
+# Fetch the purge mode, return a boolean from config file
+def get_purge(wildcards):
+    id_name = wildcards.id
+    purge_bool = config[f'{id_name}']["purge_dups"]
+    return purge_bool
+
+# Fetch the purge level for hifiasm, return a boolean from config file
+def get_purge(wildcards):
+    id_name = wildcards.id
+    force = config[f'{id_name}']["assembly_purge_force"]
+    return force
\ No newline at end of file
--
GitLab

From 4efbe2602b08f677dbef0626089b230d6f0e0a7a Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 15 Oct 2024 13:50:20 +0200
Subject: [PATCH 032/178] Add support for purge_dups as module

---
 workflow/Snakefile                          |  82 ++++----
 workflow/rules/07_report.smk                | 202 ++++++++++++++------
 workflow/scripts/from_config/target_list.py |  16 +-
 workflow/scripts/report.Rmd                 |  78 ++++----
 workflow/scripts/report_trio.Rmd            |  81 ++++----
 5 files changed, 258 insertions(+), 201 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 951f834..6135456 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,14 +1,12 @@
 configfile: ".config/masterconfig.yaml"
 
-###### Include all scripts & rules necessary to run the workflow ######
-### Scripts
-# get parameters from masterconfig
+# Include all the scripts
 include: "scripts/from_config/hifiasm_mode.py"
 include: "scripts/from_config/parameters.py"
 include: "scripts/from_config/target_list.py"
 include: "scripts/path_helper.py"
 
-### paths
+# Get paths to the WD
 if config["root"].startswith("."):
    abs_root_path = get_abs_root_path()
    res_path = get_res_path()
@@ -16,96 +14,96 @@ else:
    abs_root_path = config["root"]
    res_path = abs_root_path + "/" + config["resdir"]
 
-
-### Rules
-## PRE ASSEMBLY QC
+# Include all the rules
 include: "rules/01_qc.smk"
-## ASSEMBLY
 include: "rules/02_asm.smk"
-# Statistics
 include: "rules/03_asm_qc.smk"
 include: "rules/03.5_asm_qc_merqury.smk"
-# Purging
 include: "rules/04_purge_dups.smk"
 include: "rules/05_purged_asm_qc.smk"
 include: "rules/05.5_purged_asm_qc_merqury.smk"
-# Link final assembly
 include: "rules/06_sym_link_hap.smk"
-## AUTOMATIC REPORT
 include: "rules/07_report.smk"
-
-## runtime
 include: "rules/00_runtime.smk"
 
-###### get filenames for workflow ######
+# Get the filenames of inputs
 IDS=config["IDS"]
 bamIDS=check_bam(IDS)
 fastqIDS=check_fastq(IDS)
-####
+
 RUNID = run_id(config["IDS"])
 BID_RUN = run_BFid(bamIDS)
 FID_RUN = run_BFid(fastqIDS)
 
-###### results path ######
-
-###### Target files ######
-## raw data stats
+# Create the list of desired outputs
+## For raw data
 longqc_output = expand(res_path + "/{Bid}/{run}/01_raw_data_QC/02_longQC", zip,
        run=BID_RUN, Bid=bamIDS),
 fastqc_output = expand(res_path + "/{Fid}/{run}/01_raw_data_QC/01_fastQC/{Fid}_fastqc.{ext}", zip,
        run=FID_RUN, Fid=fastqIDS, ext=["html", "zip"])
 
-### REPORT
+## Reports
 REP_ID = for_report(IDS)
 RUNID_REG = run_id(REP_ID)
 BUSCO_LIN = busco_lin(REP_ID)
 
+### We create additional lists for purge_dups applications
+PURGE_ID = for_purge(IDS)
+RUNID_PURGE = run_id(PURGE_ID)
+BUSCO_LIN_PURGE = busco_lin(PURGE_ID)
+
+purged_report_output = expand(res_path + "/{runid}/p_report_{id}.{lin}.html", zip,
+        runid=RUNID_PURGE, id=PURGE_ID, lin = BUSCO_LIN_PURGE )
 report_output = expand(res_path + "/{runid}/report_{id}.{lin}.html", zip,
-        runid=RUNID_REG, id=REP_ID, lin = BUSCO_LIN )
+        runid=RUNID_REG, id=REP_ID, lin = BUSCO_LIN)
 
-### REPORT TRIO
-REP_TRIO_ID = for_report_trio(IDS)
+### Same thing for trio
+REP_TRIO_ID = for_report(IDS, trio = True)
 RUNID_TRIO = run_id(REP_TRIO_ID)
 BUSCO_LIN_TRIO = busco_lin(REP_TRIO_ID)
 
+### We create additional lists for purge_dups applications
+PURGE_ID_TRIO = for_purge(IDS, trio = True)
+RUNID_PURGE_TRIO = run_id(PURGE_ID)
+BUSCO_LIN_TRIO_PURGE = busco_lin(PURGE_ID_TRIO)
+
+purged_report_trio_output = expand(res_path + "/{runid}/p_report_trio_{id}.{lin}.html", zip,
+        runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, lin = BUSCO_LIN_TRIO_PURGE)
 report_trio_output = expand(res_path + "/{runid}/report_trio_{id}.{lin}.html", zip,
        runid=RUNID_TRIO, id=REP_TRIO_ID, lin = BUSCO_LIN_TRIO)
 
-### SYM LINK
-## symbolic link to final assembly
+# Add symbolic link to final assembly
 symb_link1 = expand(res_path + "/{runid}/{id}_hap{n}.fa", zip,
-        runid=RUNID_REG, id=REP_ID, n=["1", "2"])
+        runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"])
 symb_link2 = expand(res_path + "/{runid}/{id}_hap{n}.fa", zip,
-        runid=RUNID_TRIO, id=REP_TRIO_ID, n=["1", "2"])
+        runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"])
 
 ## PURGE_DUPS CUTOFFS GRAPH
 cut_eval1 = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png", zip,
-        runid=RUNID_REG, id=REP_ID, n=["1", "2"])
+        runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"])
 cut_eval2 = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png", zip,
-        runid=RUNID_TRIO, id=REP_TRIO_ID, n=["1", "2"])
+        runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"])
 
 ## BUSCO
 busco_reg = expand(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/busco/{id}_hap{n}/short_summary.specific.{lin}.{id}_hap{n}.txt", zip,
        runid=RUNID_REG, id=REP_ID, n=["1", "2"], lin = BUSCO_LIN)
+
 busco_purged_reg = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/busco/{id}_purged_hap{n}/short_summary.specific.{lin}.{id}_purged_hap{n}.txt", zip,
-        runid=RUNID_REG, id=REP_ID, n=["1", "2"], lin = BUSCO_LIN)
+        runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"], lin = BUSCO_LIN)
 busco_trio = expand(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/busco/{id}_hap{n}/short_summary.specific.{lin}.{id}_hap{n}.txt", zip,
        runid=RUNID_TRIO, id=REP_TRIO_ID, n=["1", "2"], lin = BUSCO_LIN_TRIO)
 busco_purged_trio = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/busco/{id}_purged_hap{n}/short_summary.specific.{lin}.{id}_purged_hap{n}.txt", zip,
-        runid=RUNID_TRIO, id=REP_TRIO_ID, n=["1", "2"], lin = BUSCO_LIN_TRIO)
+        runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"], lin = BUSCO_LIN_TRIO)
 
 ## RUNTIME
 time = expand(res_path + "/{runid}/runtime.{id}.{lin}.txt", zip,
        runid = RUNID_REG, id=REP_ID, lin=BUSCO_LIN)
 time_trio = expand(res_path + "/{runid}/runtime_trio.{id}.{lin}.txt", zip,
        runid = RUNID_TRIO, id=REP_TRIO_ID, lin=BUSCO_LIN_TRIO)
-
-# LAI output
-lai = expand(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/{id}_hap{n}.out.LAI",
-        runid=RUNID_REG, id=REP_ID, n=["1", "2"])
-lai_purge = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/LAI/purge_{id}_hap{n}.out.LAI",
-        runid=RUNID_REG, id=REP_ID, n=["1", "2"])
+time_purge = expand(res_path + "/{runid}/p_runtime.{id}.{lin}.txt", zip,
+        runid = RUNID_PURGE, id=PURGE_ID, lin=BUSCO_LIN)
+time_trio_purge = expand(res_path + "/{runid}/p_runtime_trio.{id}.{lin}.txt", zip,
+        runid = RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, lin=BUSCO_LIN_TRIO)
 
 rule_all_input_list = [
    longqc_output,
@@ -122,8 +120,10 @@ rule_all_input_list = [
    busco_purged_trio,
    time,
    time_trio,
-    lai,
-    lai_purge
+    time_trio_purge,
+    time_purge,
+    purged_report_output,
+    purged_report_trio_output
 ]
 
 #### target files
diff --git a/workflow/rules/07_report.smk b/workflow/rules/07_report.smk
index 644f72a..95d9de7 100644
--- a/workflow/rules/07_report.smk
+++ b/workflow/rules/07_report.smk
@@ -7,25 +7,22 @@ P_ASM_QC = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/
 
 rule report:
    input:
-        # reads QC
-        genomescope = RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png",
-        gt_reads = RAW_QC + "/03_genometools/{id}.RawStat.txt",
-        # hifiasm assembly QC
-        gt_asm_1 = ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt",
-        gt_asm_2 = ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt",
-        busco_1 = ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt",
-        busco_2 = ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt",
-        kplot_1 = ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png",
-        kplot_2 = ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png",
-        tel_1 = ASM_QC + "/telomeres/{id}_hap1_telomeres.txt",
-        tel_2 = ASM_QC + "/telomeres/{id}_hap2_telomeres.txt",
-        LRT_recap_1 = ASM_QC + "/LAI/recap_{id}_hap1.tbl",
-        LAI_1 = ASM_QC + "/LAI/{id}_hap1.out.LAI",
-        LRT_recap_2 = ASM_QC + "/LAI/recap_{id}_hap2.tbl",
-        LAI_2 = ASM_QC + "/LAI/{id}_hap2.out.LAI",
-        merq_comp = rules.merqury.output.stat,
-        merq_err = rules.merqury.output.qv,
-        # after purge_dups assembly QC
+        genomescope=RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png",
+        gt_reads=RAW_QC + "/03_genometools/{id}.RawStat.txt",
+        gt_asm_1=ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt",
+        gt_asm_2=ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt",
+        busco_1=ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt",
+        busco_2=ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt",
+        kplot_1=ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png",
+        kplot_2=ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png",
+        tel_1=ASM_QC + "/telomeres/{id}_hap1_telomeres.txt",
+        tel_2=ASM_QC + "/telomeres/{id}_hap2_telomeres.txt",
+        LRT_recap_1=ASM_QC + "/LAI/recap_{id}_hap1.tbl",
+        LAI_1=ASM_QC + "/LAI/{id}_hap1.out.LAI",
+        LRT_recap_2=ASM_QC + "/LAI/recap_{id}_hap2.tbl",
+        LAI_2=ASM_QC + "/LAI/{id}_hap2.out.LAI",
+        merq_comp=rules.merqury.output.stat,
+        merq_err=rules.merqury.output.qv,
        P_gt_asm_1 = P_ASM_QC + "/assembly_stats/{id}_purged_hap1.AStats.txt",
        P_gt_asm_2 = P_ASM_QC + "/assembly_stats/{id}_purged_hap2.AStats.txt",
        P_busco_1 = P_ASM_QC + "/busco/{id}_purged_hap1/short_summary.specific.{lin}.{id}_purged_hap1.txt",
@@ -46,14 +43,56 @@ rule report:
        id="{id}",
        mode=get_mode,
        run=get_run,
+        purge=get_purge,
    container:
        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3"
    script:
        "../scripts/report.Rmd"
 
+
 rule rename_report:
    input:
        rules.report.output
+    output:
+        res_path + "/{runid}/p_report_{id}.{lin}.html"
+    shell:
+        "mv {input} {output}"
+
+rule no_purge_report:
+    input:
+        # Reads QC
+        genomescope=RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png",
+        gt_reads=RAW_QC + "/03_genometools/{id}.RawStat.txt",
+        # Hifiasm assembly QC
+        gt_asm_1=ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt",
+        gt_asm_2=ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt",
+        busco_1=ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt",
+        busco_2=ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt",
+        kplot_1=ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png",
+        kplot_2=ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png",
+        tel_1=ASM_QC + "/telomeres/{id}_hap1_telomeres.txt",
+        tel_2=ASM_QC + "/telomeres/{id}_hap2_telomeres.txt",
+        LRT_recap_1=ASM_QC + "/LAI/recap_{id}_hap1.tbl",
+        LAI_1=ASM_QC + "/LAI/{id}_hap1.out.LAI",
+        LRT_recap_2=ASM_QC + "/LAI/recap_{id}_hap2.tbl",
+        LAI_2=ASM_QC + "/LAI/{id}_hap2.out.LAI",
+        merq_comp=rules.merqury.output.stat,
+        merq_err=rules.merqury.output.qv
+    output:
+        res_path + "/{runid}/{id}/{lin}/report.html"
+    params:
+        id="{id}",
+        mode=get_mode,
+        run=get_run,
+        purge=get_purge,
+    container:
+        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3"
+    script:
+        "../scripts/report.Rmd"
+
+rule rename_no_purge_report:
+    input:
+        rules.no_purge_report.output
    output:
        res_path + "/{runid}/report_{id}.{lin}.html"
    shell:
        "mv {input} {output}"
 
 rule report_trio:
    input:
-        ### collect files to include in report
-        # reads QC
-        genomescope = RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png",
-        gt_reads = RAW_QC + "/03_genometools/{id}.RawStat.txt",
-        # hifiasm assembly QC
-        gt_asm_1 = ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt",
-        gt_asm_2 = ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt",
-        busco_1 = ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt",
-        busco_2 = ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt",
-        kplot_1 = ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png",
-        kplot_2 = ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png",
-        tel_1 = ASM_QC + "/telomeres/{id}_hap1_telomeres.txt",
-        tel_2 = ASM_QC + "/telomeres/{id}_hap2_telomeres.txt",
-        merq_comp = ASM_QC + "/merqury/{id}_merqury_trio.completeness.stats",
-        merq_err = ASM_QC + "/merqury/{id}_merqury_trio.qv",
-        merq_blob = ASM_QC + "/merqury/{id}_merqury_trio.hapmers.blob.png",
-        merq_block_1 = ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.block.N.png",
-        merq_block_2 = ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.block.N.png",
-        merq_block_stats_1 = ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.100_20000.phased_block.stats",
-        merq_block_stats_2 = ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.100_20000.phased_block.stats",
-        # after purge_dups assembly QC
-        P_gt_asm_1 = P_ASM_QC + "/assembly_stats/{id}_purged_hap1.AStats.txt",
-        P_gt_asm_2 = P_ASM_QC + "/assembly_stats/{id}_purged_hap2.AStats.txt",
-        P_busco_1 = P_ASM_QC + "/busco/{id}_purged_hap1/short_summary.specific.{lin}.{id}_purged_hap1.txt",
-        P_busco_2 = P_ASM_QC + "/busco/{id}_purged_hap2/short_summary.specific.{lin}.{id}_purged_hap2.txt",
-        P_kplot_1 = P_ASM_QC + "/katplot/hap1/{id}_purged_hap1.katplot.png",
-        P_kplot_2 = P_ASM_QC + "/katplot/hap2/{id}_purged_hap2.katplot.png",
-        P_tel_1 = P_ASM_QC + "/telomeres/{id}_hap1_purged_telomeres.txt",
-        P_tel_2 = P_ASM_QC + "/telomeres/{id}_hap2_purged_telomeres.txt",
-        P_merq_comp = P_ASM_QC + "/merqury/{id}_purge_merqury_trio.completeness.stats",
-        P_merq_err = P_ASM_QC + "/merqury/{id}_purge_merqury_trio.qv",
-        P_merq_blob = P_ASM_QC + "/merqury/{id}_purge_merqury_trio.hapmers.blob.png",
-        P_merq_block_1 = P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap1.purged.block.N.png",
-        P_merq_block_2 = P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap2.purged.block.N.png",
-        P_merq_block_stats_1 = P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap1.purged.100_20000.phased_block.stats",
-        P_merq_block_stats_2 = P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap2.purged.100_20000.phased_block.stats",
+        # Reads QC
+        genomescope=RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png",
+        gt_reads=RAW_QC + "/03_genometools/{id}.RawStat.txt",
+        # Hifiasm assembly QC
+        gt_asm_1=ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt",
+        gt_asm_2=ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt",
+        busco_1=ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt",
+        busco_2=ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt",
+        kplot_1=ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png",
+        kplot_2=ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png",
+        tel_1=ASM_QC + "/telomeres/{id}_hap1_telomeres.txt",
+        tel_2=ASM_QC + "/telomeres/{id}_hap2_telomeres.txt",
+        merq_comp=ASM_QC + "/merqury/{id}_merqury_trio.completeness.stats",
+        merq_err=ASM_QC + "/merqury/{id}_merqury_trio.qv",
+        merq_blob=ASM_QC + "/merqury/{id}_merqury_trio.hapmers.blob.png",
+        merq_block_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.block.N.png",
+        merq_block_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.block.N.png",
+        merq_block_stats_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.100_20000.phased_block.stats",
+        merq_block_stats_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.100_20000.phased_block.stats",
+        P_gt_asm_1=P_ASM_QC + "/assembly_stats/{id}_purged_hap1.AStats.txt",
+        P_gt_asm_2=P_ASM_QC + "/assembly_stats/{id}_purged_hap2.AStats.txt",
+        P_busco_1=P_ASM_QC + "/busco/{id}_purged_hap1/short_summary.specific.{lin}.{id}_purged_hap1.txt",
+        P_busco_2=P_ASM_QC + "/busco/{id}_purged_hap2/short_summary.specific.{lin}.{id}_purged_hap2.txt",
+        P_kplot_1=P_ASM_QC + "/katplot/hap1/{id}_purged_hap1.katplot.png",
+        P_kplot_2=P_ASM_QC + "/katplot/hap2/{id}_purged_hap2.katplot.png",
+        P_tel_1=P_ASM_QC + "/telomeres/{id}_hap1_purged_telomeres.txt",
+        P_tel_2=P_ASM_QC + "/telomeres/{id}_hap2_purged_telomeres.txt",
+        P_merq_comp=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.completeness.stats",
+        P_merq_err=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.qv",
+        P_merq_blob=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.hapmers.blob.png",
+        P_merq_block_1=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap1.purged.block.N.png",
+        P_merq_block_2=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap2.purged.block.N.png",
+        P_merq_block_stats_1=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap1.purged.100_20000.phased_block.stats",
+        P_merq_block_stats_2=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap2.purged.100_20000.phased_block.stats"
    output:
        res_path + "/{runid}/{id}/{lin}/report_trio.html"
    params:
-        id = "{id}", # get filename
-        mode = get_mode, # get assembly mode
-        p1 = get_p1,
-        p2 = get_p2,
-        run = get_run
+        id="{id}", # get filename
+        mode=get_mode, # get assembly mode
+        p1=get_p1,
+        p2=get_p2,
+        run=get_run,
+        purge=get_purge,
    container:
        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3"
    script:
        "../scripts/report_trio.Rmd"
 
-
 rule rename_report_trio:
    input:
        rules.report_trio.output
+    output:
+        res_path + "/{runid}/p_report_trio_{id}.{lin}.html"
+    shell:
+        "mv {input} {output}"
+
+rule no_purge_report_trio:
+    input:
+        # Reads QC
+        genomescope=RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png",
+        gt_reads=RAW_QC + "/03_genometools/{id}.RawStat.txt",
+        # Hifiasm assembly QC
+        gt_asm_1=ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt",
+        gt_asm_2=ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt",
+        busco_1=ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt",
+        busco_2=ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt",
+        kplot_1=ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png",
+        kplot_2=ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png",
+        tel_1=ASM_QC + "/telomeres/{id}_hap1_telomeres.txt",
+        tel_2=ASM_QC + "/telomeres/{id}_hap2_telomeres.txt",
+        merq_comp=ASM_QC + "/merqury/{id}_merqury_trio.completeness.stats",
+        merq_err=ASM_QC + "/merqury/{id}_merqury_trio.qv",
+        merq_blob=ASM_QC + "/merqury/{id}_merqury_trio.hapmers.blob.png",
+        merq_block_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.block.N.png",
+        merq_block_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.block.N.png",
+        merq_block_stats_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.100_20000.phased_block.stats",
+        merq_block_stats_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.100_20000.phased_block.stats"
+    output:
+        res_path + "/{runid}/{id}/{lin}/report_trio.html"
+    params:
+        id="{id}", # get filename
+        mode=get_mode, # get assembly mode
+        p1=get_p1,
+        p2=get_p2,
+        run=get_run,
+        purge=get_purge,
+    container:
+        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3"
+    script:
+        "../scripts/report_trio.Rmd"
+
+rule no_purge_rename_report_trio:
+    input:
+        rules.no_purge_report_trio.output
    output:
        res_path + "/{runid}/report_trio_{id}.{lin}.html"
    shell:
        "mv {input} {output}"
diff --git a/workflow/scripts/from_config/target_list.py b/workflow/scripts/from_config/target_list.py
index c322745..adb9f6d 100644
--- a/workflow/scripts/from_config/target_list.py
+++ b/workflow/scripts/from_config/target_list.py
@@ -19,21 +19,25 @@ def run_BFid(id_list):
    RUNID = expand("{runid}", runid = run_list)
    return(RUNID)
 
-#### REPORT
-def for_report(id_list):
+# Create a list of purge datasets
+def for_purge(id_list, trio =False):
    NAME = []
    for i in id_list:
        mode = config[i]["mode"]
-        if mode != "trio":
+        if mode == "trio" and trio and config[i]["purge"]:
+            NAME.append(i)
+        elif trio == False and config[i]["purge"]:
            NAME.append(i)
    return(NAME)
 
-#### REPORT TRIO
-def for_report_trio(id_list):
+# Create a list of not_purged datasets
+def for_report(id_list, trio =False):
    NAME = []
    for i in id_list:
        mode = config[i]["mode"]
-        if mode == "trio":
+        if mode == "trio" and trio and config[i]["purge"]==False:
+            NAME.append(i)
+        elif trio == False and config[i]["purge"]==False:
            NAME.append(i)
    return(NAME)
diff --git a/workflow/scripts/report.Rmd b/workflow/scripts/report.Rmd
index 7a9fb5f..ee36635 100644
--- a/workflow/scripts/report.Rmd
+++ b/workflow/scripts/report.Rmd
@@ -22,6 +22,7 @@ output:
 # `r snakemake@params[["id"]]` - run: `r snakemake@params[["run"]]`
 * Run : `r snakemake@params[["run"]]`
 * Hifiasm mode : `r snakemake@params[["mode"]]`
+* Purge conducted: `r if (snakemake@params[["purge"]]) { "Yes" } else { "No" }`
 
 ----
 
@@ -105,72 +106,61 @@ LAI
 cat(head(readLines(snakemake@input[["LAI_2"]]), 2), sep = '\n')
 ```
 
-## Assembly QC - After Purge_dups
-### Assembly statistics
-#### Hap 1
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "## Assembly QC - After Purge_dups" }`
+`r if (snakemake@params[["purge"]]) { "### Assembly statistics" }`
+`r if (snakemake@params[["purge"]]) { "#### Hap 1" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_gt_asm_1"]]), sep = '\n')
 ```
-
-#### Hap 2
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "#### Hap 2" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_gt_asm_2"]]), sep = '\n')
 ```
-### K-mer profiles
-| Hap 1 | Hap 2 |
-|-------|-------|
-|  |  |
-### K-mer completeness and error rate
-Completeness
-
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "### K-mer completeness and error rate" }`
+`r if (snakemake@params[["purge"]]) { "Completeness" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_merq_comp"]]), sep = '\n')
 ```
-
-Error rate
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "Error rate " }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_merq_err"]]), sep = '\n')
 ```
-
-### BUSCO
-#### Hap 1
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "### BUSCO" }`
+`r if (snakemake@params[["purge"]]) { "#### Hap 1" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_busco_1"]]), sep = '\n')
 ```
-#### Hap 2
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "#### Hap 2" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_busco_2"]]), sep = '\n')
 ```
-
-### Telomeres
-Telomeres present in assembly
-
-#### Hap 1
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "### Telomeres" }`
+`r if (snakemake@params[["purge"]]) { "Telomeres present in assembly" }`
+`r if (snakemake@params[["purge"]]) { "#### Hap 1" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_tel_1"]]), sep = '\n')
 ```
-#### Hap 2
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "#### Hap 2" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_tel_2"]]), sep = '\n')
 ```
-
-### LTR Assembly Index (LAI)
-#### Hap 1
-LTR recap
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "### LTR Assembly Index (LAI) " }`
+`r if (snakemake@params[["purge"]]) { "#### Hap 1" }`
+`r if (snakemake@params[["purge"]]) { "LTR recap" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(head(readLines(snakemake@input[["P_LRT_recap_1"]]), 50), sep = '\n')
 ```
-LAI
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "LAI" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(head(readLines(snakemake@input[["P_LAI_1"]]), 2), sep = '\n')
 ```
-#### Hap 2
-LTR recap
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "#### Hap 2" }`
+`r if (snakemake@params[["purge"]]) { "LTR recap" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(head(readLines(snakemake@input[["P_LRT_recap_2"]]), 50), sep = '\n')
 ```
-LAI
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "LAI" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(head(readLines(snakemake@input[["P_LAI_2"]]), 2), sep = '\n')
 ```
diff --git a/workflow/scripts/report_trio.Rmd b/workflow/scripts/report_trio.Rmd
index ac0b22c..c06eb3a 100644
--- a/workflow/scripts/report_trio.Rmd
+++ b/workflow/scripts/report_trio.Rmd
@@ -24,6 +24,7 @@ output:
 * Hifiasm mode : `r snakemake@params[["mode"]]`
 * Parent 1 : `r snakemake@params[["p1"]]`
 * Parent 2 : `r snakemake@params[["p2"]]`
+* Purge conducted: `r if (snakemake@params[["purge"]]) { "Yes" } else { "No" }`
 
 ----
 
@@ -111,73 +112,55 @@
 cat(readLines(snakemake@input[["tel_2"]]), sep = '\n')
 ```
 
 ----
 
-## Assembly QC - After Purge_dups
-### Assembly statistics
-#### Hap 1
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "## Assembly QC - After Purge_dups" }`
+`r if (snakemake@params[["purge"]]) { "### Assembly statistics" }`
+`r if (snakemake@params[["purge"]]) { "#### Hap 1" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_gt_asm_1"]]), sep = '\n')
 ```
-
-#### Hap 2
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "#### Hap 2" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_gt_asm_2"]]), sep = '\n')
 ```
-### K-mer profiles
-| Hap 1 | Hap 2 |
-|-------|-------|
-|  |  |
-### BUSCO
-#### Hap 1
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "### BUSCO" }`
+`r if (snakemake@params[["purge"]]) { "#### Hap 1" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_busco_1"]]), sep = '\n')
 ```
-#### Hap 2
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "#### Hap 2" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_busco_2"]]), sep = '\n')
 ```
+`r if (snakemake@params[["purge"]]) { "### Phasing" }`
+`r if (snakemake@params[["purge"]]) { paste("* Parent 1 :", snakemake@params[["p1"]]) }`
+`r if (snakemake@params[["purge"]]) { paste("* Parent 2 :", snakemake@params[["p2"]]) }`
+`r if (snakemake@params[["purge"]]) { paste0("") }`
-### Phasing
-* Parent 1 : `r snakemake@params[["p1"]]`
-* Parent 2 : `r snakemake@params[["p2"]]`
-
-
-
-Blocks and switch error rate
-
-| Hap 1 | Hap 2 |
-|-------|-------|
-|  |  |
-
-#### Hap 1
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "#### Hap 1" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_merq_block_stats_1"]]), sep = '\n')
 ```
-#### Hap 2
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "#### Hap 2" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_merq_block_stats_2"]]), sep = '\n')
 ```
-
-### K-mer completeness and error rate
-Completeness
-
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "### K-mer completeness and error rate" }`
+`r if (snakemake@params[["purge"]]) { "Completeness" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_merq_comp"]]), sep = '\n')
 ```
-
-Error rate
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "Error rate " }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_merq_err"]]), sep = '\n')
 ```
-
-### Telomeres
-Telomeres present in assembly
-
-#### Hap 1
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "### Telomeres" }`
+`r if (snakemake@params[["purge"]]) { "Telomeres present in assembly" }`
+`r if (snakemake@params[["purge"]]) { "#### Hap 1" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_tel_1"]]), sep = '\n')
 ```
-#### Hap 2
-```{r comment='', echo=FALSE}
+`r if (snakemake@params[["purge"]]) { "#### Hap 2" }`
+```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]}
 cat(readLines(snakemake@input[["P_tel_2"]]), sep = '\n')
-```
+```
\ No newline at end of file
--
GitLab

From fd60bb5cf22900ac895f3557526abe99c2f966b8 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 15 Oct 2024 14:05:44 +0200
Subject: [PATCH 033/178] Add control over hifiasm purge option

---
 workflow/rules/02_asm.smk                  | 10 ++++++----
 workflow/rules/07_report.smk               |  4 ++++
 workflow/scripts/from_config/parameters.py |  2 +-
 workflow/scripts/report.Rmd                |  1 +
 workflow/scripts/report_trio.Rmd           |  1 +
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/workflow/rules/02_asm.smk b/workflow/rules/02_asm.smk
index 626a927..02fa590 100644
--- a/workflow/rules/02_asm.smk
+++ b/workflow/rules/02_asm.smk
@@ -8,7 +8,8 @@ rule hifiasm:
        hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap1.p_ctg.gfa",
        hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap2.p_ctg.gfa"
    params:
-        prefix = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}"
+        prefix = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}",
+        purge_force = get_purge_force
    benchmark:
        abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_benchmark.txt"
    threads: 20
    container:
        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm:0.19.6"
    shell:
-        "hifiasm -l3 -o {params.prefix} -t {threads} {input}"
+        "hifiasm -l{params.purge_force} -o {params.prefix} -t {threads} {input}"
 
 # HI-C
rule hifiasm_hic: @@ -31,7 +32,8 @@ rule hifiasm_hic: hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap1.p_ctg.gfa", hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap2.p_ctg.gfa" params: - prefix= abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" + prefix= abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}", + purge_force = get_purge_force benchmark: abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_hic_benchmark.txt" threads: 20 @@ -40,7 +42,7 @@ rule hifiasm_hic: container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm:0.19.6" shell: - "hifiasm -l3 -o {params.prefix} -t {threads} --h1 {input.r1} --h2 {input.r2} {input.hifi}" + "hifiasm -l{params.purge_force} -o {params.prefix} -t {threads} --h1 {input.r1} --h2 {input.r2} {input.hifi}" # TRIO BINNING rule yak: diff --git a/workflow/rules/07_report.smk b/workflow/rules/07_report.smk index 95d9de7..e108f42 100644 --- a/workflow/rules/07_report.smk +++ b/workflow/rules/07_report.smk @@ -44,6 +44,7 @@ rule report: mode=get_mode, run=get_run, purge=get_purge, + purge_force = str(get_purge_force) container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: @@ -85,6 +86,7 @@ rule no_purge_report: mode=get_mode, run=get_run, purge=get_purge, + purge_force = str(get_purge_force) container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: @@ -143,6 +145,7 @@ rule report_trio: p2=get_p2, run=get_run, purge=get_purge, + purge_force = str(get_purge_force) container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: @@ -186,6 +189,7 @@ rule no_purge_report_trio: p2=get_p2, run=get_run, purge=get_purge, + purge_force = str(get_purge_force) container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: diff --git a/workflow/scripts/from_config/parameters.py b/workflow/scripts/from_config/parameters.py index 71c54f2..b29d5d6 100644 --- a/workflow/scripts/from_config/parameters.py +++ b/workflow/scripts/from_config/parameters.py @@ -44,7 +44,7 @@ def get_purge(wildcards): return purge_bool # Fetch the purge level for hifiasm, return a boolean from config file -def get_purge(wildcards): +def get_purge_force(wildcards): id_name = wildcards.id force = config[f'{id_name}']["assembly_purge_force"] return force \ No newline at end of file diff --git a/workflow/scripts/report.Rmd b/workflow/scripts/report.Rmd index ee36635..8dc0ca0 100644 --- a/workflow/scripts/report.Rmd +++ b/workflow/scripts/report.Rmd @@ -22,6 +22,7 @@ output: # `r snakemake@params[["id"]]` - run: `r snakemake@params[["run"]]` * Run : `r snakemake@params[["run"]]` * Hifiasm mode : `r snakemake@params[["mode"]]` +* Hifiasm purge mode [0-3]: `r snakemake@params[["purge_force"]]` * Purge conducted: `r if (snakemake@params[["purge"]]) { "Yes" } else { "No" }` ---- diff --git a/workflow/scripts/report_trio.Rmd b/workflow/scripts/report_trio.Rmd index c06eb3a..380500c 100644 --- a/workflow/scripts/report_trio.Rmd +++ b/workflow/scripts/report_trio.Rmd @@ -22,6 +22,7 @@ output: # `r snakemake@params[["id"]]` - run: `r snakemake@params[["run"]]` * Run : `r snakemake@params[["run"]]` * Hifiasm mode : `r snakemake@params[["mode"]]` +* Hifiasm purge mode [0-3]: `r snakemake@params[["purge_force"]]` * Parent 1 : 
`r snakemake@params[["p1"]]` * Parent 2 : `r snakemake@params[["p2"]]` * Purge conducted: `r if (snakemake@params[["purge"]]) { "Yes" } else { "No" }` -- GitLab From f4f522ea571917adc6d2767c204bda42bdd9c444 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 15 Oct 2024 16:29:06 +0200 Subject: [PATCH 034/178] better variable name --- .config/masterconfig.yaml | 22 ++++++++++----------- .gitignore | 1 + workflow/rules/07_report.smk | 2 +- workflow/scripts/from_config/parameters.py | 2 +- workflow/scripts/from_config/target_list.py | 8 ++++---- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml index c3c6030..c1769f4 100644 --- a/.config/masterconfig.yaml +++ b/.config/masterconfig.yaml @@ -12,27 +12,27 @@ tarIDS: "<tar_filenames>" # number of threads used by pigz pigz_threads: 4 # k-mers size (reduce for small datasets) -km_size: 21 #TODO, would be nice if that was modulable +km_size: 21 #Todo, add support for modularity of this value ### CONFIG IDS: ["purge", "no_purge"] purge: - run: purge_test - fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/yeast_data.fasta.gz" + run: purge + fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/Q_alba.fasta.gz" ploidy: 2 - busco_lineage: saccharomycetes_odb10 - assembly_purge_force: 0 #[0-3] - purge_dups : False + busco_lineage: eudicots_odb10 + assembly_purge_force: 3 + run_purge_dups : True mode: default no_purge: - run: purge_test - fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/yeast_data.fasta.gz" + run: no_purge + fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/Q_alba.fasta.gz" ploidy: 2 - busco_lineage: saccharomycetes_odb10 - assembly_purge_force: 3 #[0-3] - purge_dups : True + busco_lineage: eudicots_odb10 + assembly_purge_force: 1 + run_purge_dups : False mode: default ####################### workflow output directories ####################### diff --git a/.gitignore b/.gitignore index 2f4d28f..6a13d23 100644 --- a/.gitignore +++ b/.gitignore @@ -62,3 +62,4 @@ node_modules/* # 4) specific files or folder to TRACK (the '**' sign means 'any path') # 5) specific folders to UNTRACK (wherever it may be in the treeview) +.snakemake/* diff --git a/workflow/rules/07_report.smk b/workflow/rules/07_report.smk index e108f42..18ee0ed 100644 --- a/workflow/rules/07_report.smk +++ b/workflow/rules/07_report.smk @@ -44,7 +44,7 @@ rule report: mode=get_mode, run=get_run, purge=get_purge, - purge_force = str(get_purge_force) + purge_force = str(get_purge_force), container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: diff --git a/workflow/scripts/from_config/parameters.py b/workflow/scripts/from_config/parameters.py index b29d5d6..4487474 100644 --- a/workflow/scripts/from_config/parameters.py +++ b/workflow/scripts/from_config/parameters.py @@ -40,7 +40,7 @@ def get_bam(wildcards): # Fetch the purge mode, return a boolean from config file def get_purge(wildcards): id_name = wildcards.id - purge_bool = config[f'{id_name}']["purge_dups"] + purge_bool = config[f'{id_name}']["run_purge_dups"] return purge_bool # Fetch the purge level for hifiasm, return a boolean from config file diff --git a/workflow/scripts/from_config/target_list.py b/workflow/scripts/from_config/target_list.py index adb9f6d..4cb73b5 100644 --- a/workflow/scripts/from_config/target_list.py +++ b/workflow/scripts/from_config/target_list.py @@ -24,9 +24,9 @@ def for_purge(id_list, trio =False): NAME = [] for i in id_list: mode = config[i]["mode"] - if mode == "trio" and trio and 
config[i]["purge"]: + if mode == "trio" and trio and config[i]["run_purge_dups"]: NAME.append(i) - elif trio == False and config[i]["purge"]: + elif trio == False and config[i]["run_purge_dups"]: NAME.append(i) return(NAME) @@ -35,9 +35,9 @@ def for_report(id_list, trio =False): NAME = [] for i in id_list: mode = config[i]["mode"] - if mode == "trio" and trio and config[i]["purge"]==False: + if mode == "trio" and trio and config[i]["run_purge_dups"]==False: NAME.append(i) - elif trio == False and config[i]["purge"]==False: + elif trio == False and config[i]["run_purge_dups"]==False: NAME.append(i) return(NAME) -- GitLab From 3dd64bd7df5d534dc951c8dc392ac1732dcf8d75 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 16 Oct 2024 09:52:41 +0200 Subject: [PATCH 035/178] correct variable typing --- workflow/rules/07_report.smk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/rules/07_report.smk b/workflow/rules/07_report.smk index 18ee0ed..2f7ed89 100644 --- a/workflow/rules/07_report.smk +++ b/workflow/rules/07_report.smk @@ -44,7 +44,7 @@ rule report: mode=get_mode, run=get_run, purge=get_purge, - purge_force = str(get_purge_force), + purge_force = get_purge_force, container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: @@ -86,7 +86,7 @@ rule no_purge_report: mode=get_mode, run=get_run, purge=get_purge, - purge_force = str(get_purge_force) + purge_force = get_purge_force container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: @@ -145,7 +145,7 @@ rule report_trio: p2=get_p2, run=get_run, purge=get_purge, - purge_force = str(get_purge_force) + purge_force = get_purge_force container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: @@ -189,7 +189,7 @@ rule no_purge_report_trio: p2=get_p2, run=get_run, purge=get_purge, - purge_force = str(get_purge_force) + purge_force = get_purge_force container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: -- GitLab From fcfe03946dfef35ef7a3f85db12e1c5b48061c00 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 21 Oct 2024 13:46:46 +0200 Subject: [PATCH 036/178] update readme for new options --- workflow/doc/Going-further.md | 145 ++++++++++------------------------ workflow/doc/Outputs.md | 3 +- workflow/doc/Programs.md | 21 +++-- workflow/doc/Quick-start.md | 7 +- 4 files changed, 63 insertions(+), 113 deletions(-) diff --git a/workflow/doc/Going-further.md b/workflow/doc/Going-further.md index 7d27767..b2602e1 100644 --- a/workflow/doc/Going-further.md +++ b/workflow/doc/Going-further.md @@ -2,129 +2,70 @@ [TOC] -## 1. Multiple datasets -You can run the workflow on multiple datasets at the same time. +## 01. In-depth options +### Job.sh options -### 1.1. All datasets -With `masterconfig.yaml` as follow, running the workflow will assemble each dataset in its specific assembly mode. -You can add as many datasets as you want, each with different parameters. 
+For a dry run
+```bash
+sbatch job.sh dry
+```
+If you want to output the DAG of the workflow
+```bash
+sbatch job.sh dag
+```
+To run the workflow
+```bash
+sbatch job.sh
+```
+## Workflow options
+Inside the ./.config/masterconfig.yaml file you can add more options
 
 ```yaml
-IDS: ["toy_dataset", "toy_dataset_hi-c", "toy_dataset_trio"]
-
-toy_dataset:
-  fasta: "./GenomAsm4pg/tutorial_data/toy_dataset.fasta"
-  run: tutorial
-  ploidy: 2
-  busco_lineage: eudicots_odb10
-  mode: default
-
-toy_dataset_hi-c:
-  fasta: ./GenomAsm4pg/tutorial_data/hi-c/toy_dataset_hi-c.fasta
-  run: hi-c_tutorial
-  ploidy: 2
-  busco_lineage: eudicots_odb10
-  mode: hi-c
-  r1: ./GenomAsm4pg/tutorial_data/hi-c/data_r1.fasta
-  r2: ./GenomAsm4pg/tutorial_data/hi-c/data_r1.fasta
+IDS: ["example1"]
 
-toy_dataset_trio:
-  fasta: ./GenomAsm4pg/tutorial_data/trio/toy_dataset_trio.fasta
-  run: trio_tutorial
+example1:
+  fasta: ./GenomAsm4pg/tutorial_data.fasta
+  run: example_run
   ploidy: 2
   busco_lineage: eudicots_odb10
-  mode: trio
-  p1: ./GenomAsm4pg/tutorial_data/trio/data_p1.fasta
-  p2: ./GenomAsm4pg/tutorial_data/trio/data_p2.fasta
+  assembly_purge_force: 3
+  run_purge_dups : True
+  mode: default
 ```
 
 
-### 1.2. On chosen datasets
-You can remove dataset from IDS to assemble only chosen genomes:
-```yaml
-IDS: ["toy_dataset", "toy_dataset_trio"]
 
-toy_dataset:
-  fasta: "./GenomAsm4pg/tutorial_data/toy_dataset.fasta"
-  run: tutorial
-  ploidy: 2
-  busco_lineage: eudicots_odb10
-  mode: default
+- `fasta` : Your reads
+- `run` : The run name
+- `ploidy` : The ploidy of the organism
+- `busco_lineage` : The BUSCO lineage of your organism, listed [here](https://busco.ezlab.org/list_of_lineages.html)
+- `assembly_purge_force` : [0-3] the purge level of the Hifiasm `-l` parameter, full description [here](https://hifiasm.readthedocs.io/en/latest/parameter-reference.html); default is set to 3
+- `run_purge_dups` : [True, False] If set to True, the workflow will run [purge_dups](https://github.com/dfguan/purge_dups) on the assembly and rerun all the metrics. Default is set to False. Note that turning on this option will more than double the running time of the workflow.
+- `mode`: [default, hi-c, trio] See the [Hi-C assembly mode tutorial](Assembly-Mode/Hi-C-tutorial.md) or the [Trio assembly mode tutorial](Assembly-Mode/Trio-tutorial.md)
 
-toy_dataset_hi-c:
-  fasta: ./GenomAsm4pg/tutorial_data/hi-c/toy_dataset_hi-c.fasta
-  run: hi-c_tutorial
-  ploidy: 2
-  busco_lineage: eudicots_odb10
-  mode: hi-c
-  r1: ./GenomAsm4pg/tutorial_data/hi-c/data_r1.fasta
-  r2: ./GenomAsm4pg/tutorial_data/hi-c/data_r1.fasta
-
-toy_dataset_trio:
-  fasta: ./GenomAsm4pg/tutorial_data/trio/toy_dataset_trio.fasta
-  run: trio_tutorial
-  ploidy: 2
-  busco_lineage: eudicots_odb10
-  mode: trio
-  p1: ./GenomAsm4pg/tutorial_data/trio/data_p1.fasta
-  p2: ./GenomAsm4pg/tutorial_data/trio/data_p2.fasta
-```
-Running the workflow with this config will assemble only `toy_dataset` and `toy_dataset_trio`.
-## 2. Different run names
-If you want to try different parameters on the same dataset, changing the run name will create a new directory and keep the previous data.
+## 2. Run the workflow on multiple datasets
+You can run the workflow on multiple datasets at the same time.
 
-In the [Hi-C tutorial](), we used the following config.
```yaml -IDS: ["toy_dataset_hi-c"] +IDS: ["toy_dataset", "purge_dataset", "toy_dataset_hi-c", "toy_dataset_trio"] -toy_dataset_hi-c: - fasta: ./GenomAsm4pg/tutorial_data/hi-c/toy_dataset_hi-c.fasta - run: hi-c_tutorial - ploidy: 2 - busco_lineage: eudicots_odb10 - mode: hi-c - r1: ./GenomAsm4pg/tutorial_data/hi-c/data_r1.fasta - r2: ./GenomAsm4pg/tutorial_data/hi-c/data_r1.fasta -``` - -If you want to compare the Hi-C and default assembly modes, you can run the workflow with a different run name and the default mode. -```yaml -IDS: ["toy_dataset_hi-c"] +toy_dataset: + ... toy_dataset_hi-c: - fasta: ./GenomAsm4pg/tutorial_data/hi-c/toy_dataset_hi-c.fasta - run: default_comparaison - ploidy: 2 - busco_lineage: eudicots_odb10 - mode: default -``` -You will end up with 2 sub-directories for toy_dataset_hi-c (`hi-c_tutorial` and `default_comparaison`) and keep the data from the previous run in Hi-C mode. + ... -## 3. The same dataset with different parameters at once -If you want to do the previous example in one run, you will have to create a symbolic link to the fasta with a different filename. +toy_dataset_trio: + ... +``` -YAML files do not allow multiple uses of the same key. The following config does not work. +You can remove dataset from IDS to assemble only chosen genomes: ```yaml -## DOES NOT WORK -IDS: ["toy_dataset_hi-c"] - -toy_dataset_hi-c: - run: hi-c_tutorial - ploidy: 2 - busco_lineage: eudicots_odb10 - mode: hi-c - r1: ./GenomAsm4pg/tutorial_data/hi-c/data_r1.fasta - r2: ./GenomAsm4pg/tutorial_data/hi-c/data_r1.fasta - -toy_dataset_hi-c: - run: default_comparaison - ploidy: 2 - busco_lineage: eudicots_odb10 - mode: default +IDS: ["toy_dataset", "toy_dataset_trio"] ``` +Running the workflow with this config will assemble only `toy_dataset` and `toy_dataset_trio`. -**TO COMPLETE** -## 4. Optional fastq and bam files +## 3. Optional fastq and bam files If fastq and bam are available and you want to do raw QC with fastQC and longQC, add the `fastq` and/or `bam` key in your config. The fasta, fastq and bam filenames have to be the same. For example: ```yaml diff --git a/workflow/doc/Outputs.md b/workflow/doc/Outputs.md index e91b46a..6825893 100644 --- a/workflow/doc/Outputs.md +++ b/workflow/doc/Outputs.md @@ -29,7 +29,7 @@ workflow_results | ├── katplot | ├── merqury | └── telomeres - └── 02_after_purge_dups_assembly + └── 02_after_purge_dups_assembly (optional) ├── 00_assembly | ├── hap1 | └── hap2 @@ -45,3 +45,4 @@ workflow_results - Symbolic links to haplotype 1 and haplotype 2 assemblies after purge_dups - HTML report with the main results from each program - Runtime file with the total workflow runtime for the dataset +- Global QUAST report diff --git a/workflow/doc/Programs.md b/workflow/doc/Programs.md index 80e0469..2b975a7 100644 --- a/workflow/doc/Programs.md +++ b/workflow/doc/Programs.md @@ -8,7 +8,7 @@ Images are stored on the project's container registry but come from various cont - Fastq to fasta conversion - [seqtk](https://github.com/lh3/seqtk) 1.3 - Raw data quality control - - [fastqc](https://github.com/s-andrews/FastQC) 0.11.5 + - [fastqc](https://github.com/s-andrews/FastQC) 0.12.1 - [lonqQC](https://github.com/yfukasawa/LongQC) 1.2.0c - Metrics - [genometools](https://github.com/genometools/genometools) 1.5.9 @@ -18,11 +18,12 @@ Images are stored on the project's container registry but come from various cont ## 2. 
Assembly
 - Assembly
-  - [hifiasm](https://github.com/chhylp123/hifiasm) 0.16.1
+  - [hifiasm](https://github.com/chhylp123/hifiasm) 0.19.6
+  - [YAK](https://github.com/lh3/yak) 0.1
 - Metrics
   - [genometools](https://github.com/genometools/genometools) 1.5.9
 - Assembly quality control
-  - [BUSCO](https://gitlab.com/ezlab/busco) 5.3.1
+  - [BUSCO](https://gitlab.com/ezlab/busco) 5.7.1
   - [KAT](https://github.com/TGAC/KAT) 2.4.1
 - Error rate, QV & phasing
   - [meryl](https://github.com/marbl/meryl) and [merqury](https://github.com/marbl/merqury) 1.3
@@ -32,9 +33,13 @@ Images are stored on the project's container registry but come from various cont
 - Haplotigs and overlaps purging
   - [purge_dups](https://github.com/dfguan/purge_dups) 1.2.5
   - **matplotlib** 0.11.5
+- Repeated elements quantification
+  - [LTR_retriever](https://github.com/oushujun/LTR_retriever) 3.0.1
+  - [LTR_Finder](https://github.com/xzhub/LTR_Finder) latest as of October 2024
 
 ## 3. Report
 - **R markdown** 4.0.3
+- [QUAST](https://github.com/ablab/quast) 5.2.0
 
 # Docker images
 The programs are pulled automatically as images by Snakemake the first time you run the workflow. It may take some time. Images are only downloaded once and reused automatically by the workflow.
@@ -42,16 +47,20 @@ Images are stored on the project's container registry but come from various cont
 
 - [smrtlink](https://hub.docker.com/r/bryce911/smrtlink/tags)
 - [seqtk](https://hub.docker.com/r/nanozoo/seqtk)
-- [fastqc](https://hub.docker.com/r/biocontainers/fastqc/tags)
+- [fastqc](https://hub.docker.com/r/staphb/fastqc/tags)
 - [lonqQC](https://hub.docker.com/r/grpiccoli/longqc/tags)
 - [genometools](https://hub.docker.com/r/biocontainers/genometools/tags)
 - [jellyfish](https://quay.io/repository/biocontainers/kmer-jellyfish?tab=tags)
 - [genomescope](https://hub.docker.com/r/abner12/genomescope)
-- [hifiasm](https://quay.io/repository/biocontainers/hifiasm?tab=tags)
+- hifiasm, custom
+- yak, custom
 - [BUSCO](https://hub.docker.com/r/ezlabgva/busco/tags)
 - [KAT](https://quay.io/repository/biocontainers/kat)
 - [meryl and merqury](https://quay.io/repository/biocontainers/merqury?tab=tags)
 - [Biopython for FindTelomeres](https://quay.io/repository/biocontainers/biopython?tab=tags)
-- [purge_dups](https://quay.io/repository/biocontainers/purge_dups?tab=tags)
+- [purge_dups](https://hub.docker.com/r/wangnan9394/purge_dups/tags)
 - [matplotlib as companion to purge_dups](https://hub.docker.com/r/biocontainers/matplotlib-venn/tags)
 - [R markdown](https://hub.docker.com/r/reslp/rmarkdown/tags)
+- LTR_retriever
+- LTR_Finder custom
+- [QUAST](https://hub.docker.com/r/staphb/quast/tags)
\ No newline at end of file
diff --git a/workflow/doc/Quick-start.md b/workflow/doc/Quick-start.md
index 8e55591..548b5c2 100644
--- a/workflow/doc/Quick-start.md
+++ b/workflow/doc/Quick-start.md
@@ -39,7 +39,7 @@ toy_dataset:
   fasta: "./GenomAsm4pg/tutorial_data/toy_dataset.fasta"
   run: tutorial
   ploidy: 2
-  busco_lineage: eudicots_odb10
+  busco_lineage: eudicots_odb10 
   mode: default
 ```
 ## 2. Adapt the scripts to your HPC
@@ -51,6 +51,7 @@ The current profile is configured for SLURM. If you use SLURM, change line 13 to
 
 To run this workflow on another HPC, create a new profile (https://github.com/Snakemake-Profiles) and add it to the .config/snakemake_profile directory. Update the CLUSTER_CONFIG and PROFILE variables in the job.sh and prejob.sh scripts.
 If your cluster doesn't have Singularity enabled by default, add it to the list of modules to load in job.sh.
+
 ## 3.
Dry run
 To check the configuration, first perform a dry run of the workflow:
 ```bash
@@ -64,6 +65,4 @@ Then, run the script:
 sbatch job.sh
 ```
 ## Other assembly modes
-If you want to use additional Hi-C data or parental data, follow the [Hi-C assembly mode tutorial](Assembly-Mode/Hi-C-tutorial.md) or the [Trio assembly mode tutorial](Assembly-Mode/Trio-tutorial.md). To go further with the workflow use go [here](Going-further.md).
-
-**TO-DO : add a toy fasta.**
\ No newline at end of file
+If you want to use additional Hi-C data or parental data, follow the [Hi-C assembly mode tutorial](Assembly-Mode/Hi-C-tutorial.md) or the [Trio assembly mode tutorial](Assembly-Mode/Trio-tutorial.md). To go further with the workflow, go [here](Going-further.md).
\ No newline at end of file
-- GitLab


From 4d379f6140c9f99fce5cda215af21110cb44fcfd Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Mon, 21 Oct 2024 17:53:30 +0200
Subject: [PATCH 037/178] Add quast to the workflow

---
 .config/masterconfig.yaml                   | 11 +++++----
 workflow/Snakefile                          |  7 +++++-
 workflow/rules/08_QUAST.smk                 | 17 ++++++++++++++
 workflow/scripts/from_config/target_list.py |  9 +++++++
 workflow/scripts/path_helper.py             | 26 ++++++++++++++++++++-
 5 files changed, 63 insertions(+), 7 deletions(-)
 create mode 100644 workflow/rules/08_QUAST.smk

diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml
index c1769f4..df135c3 100644
--- a/.config/masterconfig.yaml
+++ b/.config/masterconfig.yaml
@@ -9,11 +9,6 @@ get_all_tar_filename: False
 tarIDS: "<tar_filenames>"
 
 ####################### job - workflow #######################
-# number of threads used by pigz
-pigz_threads: 4
-# k-mers size (reduce for small datasets)
-km_size: 21 #Todo, add support for modularity of this value
-
 ### CONFIG
 IDS: ["purge", "no_purge"]
 
@@ -35,6 +30,9 @@ no_purge:
   run_purge_dups : False
   mode: default
 
+reference_genome : "/mnt/cbib/pangenoak_trials/GenomAsm4pg/my_reference.fasta.gz"
+run_quast: True
+
 ####################### workflow output directories #######################
 # results directory
 resdir: results
@@ -60,3 +58,6 @@ asm_purged: 02_after_purge_dups_assembly
 asm_conta: 03_uncontaminated_assembly
 asm: 00_assembly
 asm_qc: 01_assembly_QC
+
+# number of threads used by pigz
+pigz_threads: 4
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6135456..de2a554 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -25,6 +25,7 @@ include: "rules/05.5_purged_asm_qc_merqury.smk"
 include: "rules/06_sym_link_hap.smk"
 include: "rules/07_report.smk"
 include: "rules/00_runtime.smk"
+include: "rules/08_QUAST.smk"
 
 # Get the filenames of inputs
 IDS=config["IDS"]
@@ -105,6 +106,9 @@ time_purge = expand(res_path + "/{runid}/p_runtime.{id}.{lin}.txt", zip,
 time_trio_purge = expand(res_path + "/{runid}/p_runtime_trio.{id}.{lin}.txt", zip,
     runid = RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, lin=BUSCO_LIN_TRIO)
 
+# QUAST
+quast = check_quast(res_path)
+
 rule_all_input_list = [
     longqc_output,
     fastqc_output,
@@ -123,7 +127,8 @@ rule_all_input_list = [
     time_trio_purge,
     time_purge,
     purged_report_output,
-    purged_report_trio_output
+    purged_report_trio_output,
+    quast
 ]
 
 #### target files
diff --git a/workflow/rules/08_QUAST.smk b/workflow/rules/08_QUAST.smk
new file mode 100644
index 0000000..d2f1ff0
--- /dev/null
+++ b/workflow/rules/08_QUAST.smk
@@ -0,0 +1,17 @@
+
+# Rule to run quast on all the assembled genomes
+assemblies = find_all_assemblies()
+
+rule QUAST:
+    params:
+        assemblies=assemblies,
+        output_folder=
res_path + "/global_quast_report/" + output: + res_path + "/global_quast_report/report.html" + container: + "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/staphb/quast:5.2.0" + shell: + """ + python /quast-5.2.0/quast-lg.py {params.assemblies} -o {params.output_folder} && + rm -rf {params.output_folder}/contigs_reports/ + """ \ No newline at end of file diff --git a/workflow/scripts/from_config/target_list.py b/workflow/scripts/from_config/target_list.py index 4cb73b5..ec87c79 100644 --- a/workflow/scripts/from_config/target_list.py +++ b/workflow/scripts/from_config/target_list.py @@ -65,3 +65,12 @@ def check_fastq(id_list): if "fastq" in config[i]: IDS.append(i) return(IDS) + +# QUAST +def check_quast(res_path): + """ + Check if run_quast is set to true in the masterconfig + """ + if config["run_quast"]: + return res_path + "/global_quast_report/report.html" + return [] \ No newline at end of file diff --git a/workflow/scripts/path_helper.py b/workflow/scripts/path_helper.py index 1aa7e2e..2e3a9ee 100644 --- a/workflow/scripts/path_helper.py +++ b/workflow/scripts/path_helper.py @@ -10,4 +10,28 @@ def get_abs_root_path(): def get_res_path(): abs_root_path = os.path.abspath(config["root"]) res_path= abs_root_path + "/" + config["resdir"] - return(res_path) \ No newline at end of file + return(res_path) + +def find_all_assemblies(): + """ + Output all the asembled genomes in the results directories + """ + res_path = get_res_path() + assemblies = [] + for root, _, files in os.walk(res_path): + for file in files: + if file.endswith(".fa.gz"): + assemblies.append(os.path.join(root, file)) + return assemblies + +def find_all_assemblies(): + """ + Output all the asembled genomes in the results directories + """ + res_path = get_res_path() + assemblies = [] + for root, _, files in os.walk(res_path): + for file in files: + if file.endswith(".fa.gz"): + assemblies.append(os.path.join(root, file)) + return assemblies \ No newline at end of file -- GitLab From 03dd89a722a6a3e2c580aa425ba56261f8d2331b Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 6 Jan 2025 15:48:42 +0100 Subject: [PATCH 038/178] update default parameters --- workflow/scripts/from_config/parameters.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/workflow/scripts/from_config/parameters.py b/workflow/scripts/from_config/parameters.py index 4487474..8745d4e 100644 --- a/workflow/scripts/from_config/parameters.py +++ b/workflow/scripts/from_config/parameters.py @@ -7,11 +7,14 @@ def get_busco_lin(wildcards): lineage = config[f'{id_name}']["busco_lineage"] return(lineage) -#### PLOIDY +#### Ploidy def get_ploidy(wildcards): id_name = wildcards.id - ploidy = config[f'{id_name}']["ploidy"] - return(ploidy) + try : + ploidy = config[f'{id_name}']["ploidy"] + except KeyError: + return 2 + return ploidy #### RUN NAME def get_run(wildcards): @@ -40,11 +43,17 @@ def get_bam(wildcards): # Fetch the purge mode, return a boolean from config file def get_purge(wildcards): id_name = wildcards.id - purge_bool = config[f'{id_name}']["run_purge_dups"] + try : + purge_bool = config[f'{id_name}']["run_purge_dups"] + except KeyError: + return False return purge_bool -# Fetch the purge level for hifiasm, return a boolean from config file +# Fetch the purge level for hifiasm def get_purge_force(wildcards): id_name = wildcards.id - force = config[f'{id_name}']["assembly_purge_force"] + try : + force = config[f'{id_name}']["assembly_purge_force"] + except KeyError: + return '3' 
return force
\ No newline at end of file
-- GitLab


From 4df0a0dc07e0f84793997de5aa571917adc6d2767c204bda42bdd9c444 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Mon, 6 Jan 2025 15:52:46 +0100
Subject: [PATCH 039/178] prepare for ragtag option

---
 .config/masterconfig.yaml           | 14 +++-----------
 workflow/Snakefile                  |  7 +------
 workflow/doc/Going-further.md       |  3 +++
 workflow/rules/03_asm_qc.smk        |  3 +--
 workflow/rules/05_purged_asm_qc.smk |  1 -
 workflow/rules/08_QUAST.smk         | 17 -----------------
 workflow/scripts/path_helper.py     | 14 +++-----------
 7 files changed, 11 insertions(+), 48 deletions(-)
 delete mode 100644 workflow/rules/08_QUAST.smk

diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml
index df135c3..99fa404 100644
--- a/.config/masterconfig.yaml
+++ b/.config/masterconfig.yaml
@@ -9,6 +9,8 @@ get_all_tar_filename: False
 tarIDS: "<tar_filenames>"
 
 ####################### job - workflow #######################
+
+
 ### CONFIG
 IDS: ["purge", "no_purge"]
 
@@ -17,21 +19,11 @@ purge:
   fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/Q_alba.fasta.gz"
   ploidy: 2
   busco_lineage: eudicots_odb10
-  assembly_purge_force: 3
-  run_purge_dups : True
-  mode: default
-
-no_purge:
-  run: no_purge
-  fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/Q_alba.fasta.gz"
-  ploidy: 2
-  busco_lineage: eudicots_odb10
-  assembly_purge_force: 1
-  run_purge_dups : False
   mode: default
 
 reference_genome : "/mnt/cbib/pangenoak_trials/GenomAsm4pg/my_reference.fasta.gz"
 run_quast: True
+run_ragtag: True
 
 ####################### workflow output directories #######################
 # results directory
diff --git a/workflow/Snakefile b/workflow/Snakefile
index de2a554..6135456 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -25,7 +25,6 @@ include: "rules/05.5_purged_asm_qc_merqury.smk"
 include: "rules/06_sym_link_hap.smk"
 include: "rules/07_report.smk"
 include: "rules/00_runtime.smk"
-include: "rules/08_QUAST.smk"
 
 # Get the filenames of inputs
 IDS=config["IDS"]
@@ -106,9 +105,6 @@ time_purge = expand(res_path + "/{runid}/p_runtime.{id}.{lin}.txt", zip,
 time_trio_purge = expand(res_path + "/{runid}/p_runtime_trio.{id}.{lin}.txt", zip,
     runid = RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, lin=BUSCO_LIN_TRIO)
 
-# QUAST
-quast = check_quast(res_path)
-
 rule_all_input_list = [
     longqc_output,
     fastqc_output,
@@ -127,8 +123,7 @@ rule_all_input_list = [
     time_trio_purge,
     time_purge,
     purged_report_output,
-    purged_report_trio_output,
-    quast
+    purged_report_trio_output
 ]
 
 #### target files
diff --git a/workflow/doc/Going-further.md b/workflow/doc/Going-further.md
index b2602e1..c508e49 100644
--- a/workflow/doc/Going-further.md
+++ b/workflow/doc/Going-further.md
@@ -80,3 +80,6 @@ toy_dataset:
   busco_lineage: eudicots_odb10
   mode: default
 ```
+
+## 4. Add a reference genome
+You can add a reference genome to the `.masterconfig` and set `run_ragtag` to True to run RagTag on your output.
\ No newline at end of file
diff --git a/workflow/rules/03_asm_qc.smk b/workflow/rules/03_asm_qc.smk
index ca712f9..edae4ec 100644
--- a/workflow/rules/03_asm_qc.smk
+++ b/workflow/rules/03_asm_qc.smk
@@ -49,12 +49,11 @@ rule kat:
     params:
         prefix="{id}_hap{n}",
         path=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/katplot/hap{n}/{id}_hap{n}",
-        km_size = config["km_size"]
     threads: 4
     container:
        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/kat2.4.1"
    shell:
-        "kat comp -o {params.path} -t {threads} -m {params.km_size} --output_type png -v {input.jellyfish} {input.hap} && "
+        "kat comp -o {params.path} -t {threads} -m 21 --output_type png -v {input.jellyfish} {input.hap} && "
        "kat plot spectra-cn -x 200 -o {params.path}.katplot.png {params.path}-main.mx"
 
 # telomeres
diff --git a/workflow/rules/05_purged_asm_qc.smk b/workflow/rules/05_purged_asm_qc.smk
index 945c382..9ce446f 100644
--- a/workflow/rules/05_purged_asm_qc.smk
+++ b/workflow/rules/05_purged_asm_qc.smk
@@ -30,7 +30,6 @@ use rule kat as purge_kat with:
     output:
         res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/katplot/hap{n}/{id}_purged_hap{n}.katplot.png"
     params:
-        km_size = config["km_size"],
         prefix="{id}_hap{n}",
         path= res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/katplot/hap{n}/{id}_purged_hap{n}"
 
diff --git a/workflow/rules/08_QUAST.smk b/workflow/rules/08_QUAST.smk
deleted file mode 100644
index d2f1ff0..0000000
--- a/workflow/rules/08_QUAST.smk
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# Rule to run quast on all the assembled genomes
-assemblies = find_all_assemblies()
-
-rule QUAST:
-    params:
-        assemblies=assemblies,
-        output_folder= res_path + "/global_quast_report/"
-    output:
-        res_path + "/global_quast_report/report.html"
-    container:
-        "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/staphb/quast:5.2.0"
-    shell:
-        """
-        python /quast-5.2.0/quast-lg.py {params.assemblies} -o {params.output_folder} &&
-        rm -rf {params.output_folder}/contigs_reports/
-        """
\ No newline at end of file
diff --git a/workflow/scripts/path_helper.py b/workflow/scripts/path_helper.py
index 2e3a9ee..e197410 100644
--- a/workflow/scripts/path_helper.py
+++ b/workflow/scripts/path_helper.py
@@ -24,14 +24,6 @@ def find_all_assemblies():
             assemblies.append(os.path.join(root, file))
     return assemblies
 
-def find_all_assemblies():
-    """
-    Output all the asembled genomes in the results directories
-    """
-    res_path = get_res_path()
-    assemblies = []
-    for root, _, files in os.walk(res_path):
-        for file in files:
-            if file.endswith(".fa.gz"):
-                assemblies.append(os.path.join(root, file))
-    return assemblies
\ No newline at end of file
+def get_ref():
+    ref_path = os.path.abspath(config["reference_genome"])
+    return(ref_path)
\ No newline at end of file
-- GitLab


From 83b2aa5452715da746386a340da5b7905abbe12f Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Mon, 6 Jan 2025 16:53:50 +0100
Subject: [PATCH 040/178] remove fuss

---
 .gitignore | 47 -----------------------------------------------
 1 file changed, 47 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6a13d23..0ce72b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,42 +1,5 @@
-# In this gitignore, we want to set up a white list (usually gitignore works as
- # a black list). So we first exclude all files from tracking and then include
- # only the file patterns we are interested in (i.e. we don't want big fastq
- # or bam files to be synchronized on our git remote server).
- - -# 1) First, exclude files from git tracking - # exclude ALL files from tracking using the '*' wildcard - # Note that we could be more more specific here but that might be risky as - # one cannot necessarily think about all files contained in a folder that - # should not be sync (especially sometimes remanent huge temp files - # remaining from aborted analysis run) -* - -# 2) Second, set exceptions to be tracked by git: - # thanks to the '!' symbol, we can track files with specific patterns - # even though they would be listed in section 1 (as a matter of fact, - # in the present case, any possible file is listed in section 1 since - # the pattern is the '*' wildcard). - # Note that the '#' comments anything on its rights, meaning that patterns - # below are still effectively ignored from git tracking. - # If you want to (un)track a given file, just comment toggle the respective line. - # of course, one can add new patterns at will. -!*.csv -#!*.cfg -!*.dummy -#!*.err -#!*.error -#!*.errors !.gitignore -#!*.html -!*.jl -!.keep -#!*Makefile* !*.md -#!*.ods -#!*.odt -#!*.pptx -!*.pl !*.py !*README* !*.R @@ -44,22 +7,12 @@ !*.r !*.rmd !*.sh -#!*.template -#!*.txt !*.yml !*.yaml !Snakefile !*.smk !*.svg - - -# 3) add a pattern to track the file patterns of section2 even if they are in - # subdirectories !*/ node_modules node_modules/* - -# 4) specific files or folder to TRACK (the '**' sign means 'any path') - -# 5) specific folders to UNTRACK (wherever it may be in the treeview) .snakemake/* -- GitLab From 134f38e7b697690f8fab722de0cb0ba951cf24a2 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 6 Jan 2025 16:54:20 +0100 Subject: [PATCH 041/178] remove unused CICD --- .gitlab-ci.yml | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 .gitlab-ci.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 2166f91..0000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,5 +0,0 @@ -test_job: - stage: test - image: continuumio/miniconda3 - script: - - echo "Temporarily clear this file to reduce computational load on every push." 
-- GitLab From b05fdf15d84347a73eb7a3e7cad21477f0f7116b Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 6 Jan 2025 16:54:36 +0100 Subject: [PATCH 042/178] clean file --- prejob.sh | 67 ------------------------------------------------------- 1 file changed, 67 deletions(-) delete mode 100644 prejob.sh diff --git a/prejob.sh b/prejob.sh deleted file mode 100644 index 16eece8..0000000 --- a/prejob.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -################################ Slurm options ################################# -### prepare_calling_jobs -#SBATCH -J smk_prejob -### Max run time "hours:minutes:seconds" -#SBATCH --time=96:00:00 -#SBATCH --ntasks=1 #nb of processes -#SBATCH --cpus-per-task=1 # nb of cores for each process(1 process) -#SBATCH --mem=10G # max of memory (-m) -### Requirements nodes/servers (default: 1) -#SBATCH --nodes=1 -### Requirements cpu/core/task (default: 1) -#SBATCH --ntasks-per-node=1 -#SBATCH -o slurm_logs/snakemake_prejob.%N.%j.out -#SBATCH -e slurm_logs/snakemake_prejob.%N.%j.err -#SBATCH --mail-type=END,FAIL -#SBATCH --mail-user=lucien.piat@inrae.fr -################################################################################ - -# Useful information to print -echo '########################################' -echo 'Date:' $(date --iso-8601=seconds) -echo 'User:' $USER -echo 'Host:' $HOSTNAME -echo 'Job Name:' $SLURM_JOB_NAME -echo 'Job ID:' $SLURM_JOB_ID -echo 'Number of nodes assigned to job:' $SLURM_JOB_NUM_NODES -echo 'Total number of cores for job (?):' $SLURM_NTASKS -echo 'Number of requested cores per node:' $SLURM_NTASKS_PER_NODE -echo 'Nodes assigned to job:' $SLURM_JOB_NODELIST -echo 'Number of CPUs assigned for each task:' $SLURM_CPUS_PER_TASK -echo 'Directory:' $(pwd) -# Detail Information: -echo 'scontrol show job:' -scontrol show job $SLURM_JOB_ID -echo '########################################' - - -# Function to load modules -load_modules() { - module purge # Clear any previously loaded modules - - # Loop through each module and load it - for module_name in "$@"; do - module load "$module_name" - done -} -load_modules "python/3.9.7" "snakemake/6.5.1" - -### variable -SNG_BIND="/mnt/cbib/pangenoak_trials/GenomeAsm4pg/" -CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml" -MAX_CORES=4 -PROFILE=".config/snakemake_profile/slurm" -SMK_PATH="workflow/pre-job_snakefiles" - -echo 'Starting Snakemake - data preparation' - -### Snakemake commands -# extract data -snakemake -s $SMK_PATH/Snakefile1.smk --profile $PROFILE -j $MAX_CORES --cluster-config $CLUSTER_CONFIG -# move files -snakemake -s $SMK_PATH/Snakefile1.smk --profile $PROFILE -j $MAX_CORES --cluster-config $CLUSTER_CONFIG -R move_files -# smrtlink on bam data -snakemake -s $SMK_PATH/Snakefile2.smk --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG -# convert fastq to fasta when necessary -snakemake -s $SMK_PATH/Snakefile3.smk --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG -- GitLab From 8b08225dd61d49459722e37f2cca78dd899185d8 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 6 Jan 2025 17:04:52 +0100 Subject: [PATCH 043/178] add a local run file --- local_run.sh | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100755 local_run.sh diff --git a/local_run.sh b/local_run.sh new file mode 100755 index 0000000..3c6e16f --- /dev/null +++ 
b/local_run.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Script to run locally, DO NOT USE AS IS ON A CLUSTER! + +SNG_BIND=$(pwd) + +# Get the number of CPU cores dynamically +CORES=$(nproc) + +run_snakemake() { + + local option="$2" # The option for dry run or DAG + echo "Starting $snakefile..." + + # Execute the Snakemake command with the specified option + if [[ "$option" == "dry" ]]; then + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES -n + elif [[ "$option" == "dag" ]]; then + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES --dag > dag.dot + echo "DAG has been generated as dag.png" + return + else + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES --forceall + fi + + # Check if the Snakemake command was successful + if [ $? -eq 0 ]; then + echo "$snakefile completed successfully." + else + echo "Error: $snakefile failed." + exit 1 + fi +} + +if [ $# -eq 0 ]; then + echo "Usage: $0 [dry|dag|run]" + echo " dry - run the specified Snakefile in dry-run mode" + echo " dag - generate DAG for the specified Snakefile" + echo " run - run the specified Snakefile normally (default)" + exit 1 +fi + +run_snakemake "$option" -- GitLab From 99888ceecc297f8ba31777e17c3f4890696226fc Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 6 Jan 2025 17:05:13 +0100 Subject: [PATCH 044/178] cleanup the config file, remove unecessary options --- .config/masterconfig.yaml | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml index 99fa404..2e9f44f 100644 --- a/.config/masterconfig.yaml +++ b/.config/masterconfig.yaml @@ -1,33 +1,24 @@ -# absolute path to your desired output path -root: /mnt/cbib/pangenoak_trials/GenomAsm4pg +# Config file -####################### optional prejob - data preparation ####################### -# path to tar data -data: /mnt/cbib/pangenoak_trails/GenomAsm4pg -# list of tar names -get_all_tar_filename: False -tarIDS: "<tar_filenames>" +samples: + test_sample1: + fasta_gz: "input_reads.fa.gz" + run: purge + fasta: "Q_alba.fasta.gz" + busco_lineage: eudicots_odb10 + mode: default -####################### job - workflow ####################### +reference_genome : "my_reference.fasta.gz" +run_quast: True +run_ragtag: True +memory_multiplier: 100 # Multiplier for scaling memory across all rules +container_registry: "docker://registry.forgemia.inra.fr/pangepop/mspangepop" +output_dir: "results/" -### CONFIG -IDS: ["purge", "no_purge"] -purge: - run: purge - fasta: "/mnt/cbib/pangenoak_trials/GenomAsm4pg/Q_alba.fasta.gz" - ploidy: 2 - busco_lineage: eudicots_odb10 - mode: default -reference_genome : "/mnt/cbib/pangenoak_trials/GenomAsm4pg/my_reference.fasta.gz" -run_quast: True -run_ragtag: True -####################### workflow output directories ####################### -# results directory -resdir: results ### PREJOB # extracted input data -- GitLab From 013e1a47e000e33f36dfe824ea2dc1832ae2892e Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 11:34:21 +0100 Subject: [PATCH 045/178] remove all results files --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0ce72b3..14ed260 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +* !.gitignore !*.md !*.py @@ -15,4 +16,5 @@ !*/ node_modules node_modules/* -.snakemake/* +!slurm_logs/* +!workflow/* \ No newline at end of file -- GitLab From 
5c4edb5747b260dca8d13728a329906a46c48ecd Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 7 Jan 2025 11:34:52 +0100
Subject: [PATCH 046/178] Add helper script to retrieve config information

---
 workflow/scripts/parameter_retrieval.py | 26 +++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 workflow/scripts/parameter_retrieval.py

diff --git a/workflow/scripts/parameter_retrieval.py b/workflow/scripts/parameter_retrieval.py
new file mode 100644
index 0000000..1b70a38
--- /dev/null
+++ b/workflow/scripts/parameter_retrieval.py
@@ -0,0 +1,26 @@
+from snakemake.io import expand
+# Used to retrieve the parameters for rules
+
+# Fetch the purge level for hifiasm
+def get_purge_force(wildcards):
+    try :
+        force = config["samples"][wildcards.sample]["assembly_purge_force"]
+    except KeyError:
+        print('No "assembly_purge_force" specified, using l3 by default')
+        return '3'
+    return force
+
+def get_mode(wildcards):
+    try :
+        mode = config["samples"][wildcards.sample]["mode"]
+    except KeyError:
+        print('No "mode" specified, using default')
+        return 'default'
+    return mode
+
+def get_run(wildcards, run:int):
+    try :
+        run= config["samples"][wildcards.sample][f"r{run}"]
+    except KeyError:
+        return 'None'
+    return run
\ No newline at end of file
-- GitLab


From ace23b97a38b93e536018e36894fb189da4f6e1f Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 7 Jan 2025 11:35:01 +0100
Subject: [PATCH 047/178] add a TODO

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index e8dd7a8..8542698 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+# TODO, ADAPT THIS OBSOLETE README
+
 # <A HREF="https://forgemia.inra.fr/asm4pg/GenomAsm4pg"> asm4pg </A>
 An automatic and reproducible genome assembly workflow for pangenomic applications using PacBio
 HiFi data.
-- GitLab


From a9a8a5d99dbeb5c316570559c5e5705637cd17d9 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 7 Jan 2025 11:35:24 +0100
Subject: [PATCH 048/178] adapt the local run file from MSpangepop

---
 local_run.sh | 55 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/local_run.sh b/local_run.sh
index 3c6e16f..c26b9f6 100755
--- a/local_run.sh
+++ b/local_run.sh
@@ -1,42 +1,55 @@
 #!/bin/bash
 # Script to run locally, DO NOT USE AS IS ON A CLUSTER!
 
-SNG_BIND=$(pwd)
+# Written by Lucien Piat at INRAe
+# 07/01/24
 
-# Get the number of CPU cores dynamically
+SNG_BIND=$(pwd)
 CORES=$(nproc)
 
 run_snakemake() {
-
-    local option="$2" # The option for dry run or DAG
-    echo "Starting $snakefile..."
-
-    # Execute the Snakemake command with the specified option
-    if [[ "$option" == "dry" ]]; then
-        snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES -n
-    elif [[ "$option" == "dag" ]]; then
-        snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES --dag > dag.dot
-        echo "DAG has been generated as dag.png"
-        return
-    else
-        snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES --forceall
-    fi
+    local option="$1"
+
+    case "$option" in
+        dry)
+            snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES -n
+            ;;
+        dag)
+            snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES --dag > dag.dot
+            if [ $? -eq 0 ]; then
+                echo "DAG has been successfully generated as dag.dot"
+            else
+                echo "Error: Failed to generate DAG."
+ exit 1 + fi + ;; + run) + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES #--forceall + ;; + *) + echo "Invalid option: $option" + echo "Usage: $0 [dry|dag|run]" + exit 1 + ;; + esac # Check if the Snakemake command was successful if [ $? -eq 0 ]; then - echo "$snakefile completed successfully." + echo "Snakemake completed successfully." else - echo "Error: $snakefile failed." + echo "Error: Snakemake execution failed." exit 1 fi } -if [ $# -eq 0 ]; then +# Verify arguments +if [ $# -ne 1 ]; then echo "Usage: $0 [dry|dag|run]" echo " dry - run the specified Snakefile in dry-run mode" echo " dag - generate DAG for the specified Snakefile" - echo " run - run the specified Snakefile normally (default)" + echo " run - run the specified Snakefile normally" exit 1 fi -run_snakemake "$option" +# Execute the function with the provided option +run_snakemake "$1" -- GitLab From 2cefc09e98521bc51f7dcc4bee27a0becd32ea89 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 11:36:01 +0100 Subject: [PATCH 049/178] Add two test run --- .config/masterconfig.yaml | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml index 2e9f44f..e639ff0 100644 --- a/.config/masterconfig.yaml +++ b/.config/masterconfig.yaml @@ -1,31 +1,26 @@ # Config file samples: - test_sample1: - fasta_gz: "input_reads.fa.gz" - run: purge - fasta: "Q_alba.fasta.gz" - busco_lineage: eudicots_odb10 - mode: default + test_default: + fasta_gz: small_example.fasta.gz + busco_lineage: eudicots_odb10 + test_hi-c: + fasta_gz: small_example.fasta.gz + mode: hi-c + r1: small_example.fasta.gz + r2: small_example.fasta.gz + assembly_purge_force: 2 + busco_lineage: eudicots_odb10 + reference_genome : "my_reference.fasta.gz" run_quast: True run_ragtag: True -memory_multiplier: 100 # Multiplier for scaling memory across all rules -container_registry: "docker://registry.forgemia.inra.fr/pangepop/mspangepop" +container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg" output_dir: "results/" - - - -### PREJOB -# extracted input data -rawdir: 00_input_data -bamdir: 00_input_data/bam_files -fastxdir: 00_input_data/fastx_files - ### JOB # QC qcdir: 01_raw_data_QC -- GitLab From 4fccd08537a8ea4355ac250e549f5b6ad5eec311 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 11:36:19 +0100 Subject: [PATCH 050/178] add updated config file for SLURM plugin --- .config/snakemake/profiles/slurm/config.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .config/snakemake/profiles/slurm/config.yaml diff --git a/.config/snakemake/profiles/slurm/config.yaml b/.config/snakemake/profiles/slurm/config.yaml new file mode 100644 index 0000000..24dbe1d --- /dev/null +++ b/.config/snakemake/profiles/slurm/config.yaml @@ -0,0 +1,9 @@ +executor: slurm +jobs: 10 +use-singularity: true +singularity-args: "--bind $(pwd)" + +default-resources: + #slurm_account: add if needed on your hpc + runtime: 60 + cpus_per_task: 1 \ No newline at end of file -- GitLab From 2da94ac00bc1ba22aac5c13420e677787ad6271b Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 11:36:40 +0100 Subject: [PATCH 051/178] relocate the masterconfig --- .snakemake/masterconfig.yaml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .snakemake/masterconfig.yaml diff --git a/.snakemake/masterconfig.yaml 
b/.snakemake/masterconfig.yaml new file mode 100644 index 0000000..335c7da --- /dev/null +++ b/.snakemake/masterconfig.yaml @@ -0,0 +1,34 @@ +# Config file + +samples: + test_sample1: + fasta_gz: small_example.fasta.gz + assembly_purge_force: 2 + busco_lineage: eudicots_odb10 + +reference_genome : "my_reference.fasta.gz" +run_quast: True +run_ragtag: True + +container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg" +output_dir: "results/" + + +### JOB +# QC +qcdir: 01_raw_data_QC +fqc: 01_fastQC +lqc: 02_longQC +gentools: 03_genometools +kmer: 04_kmer + +# assembly +assembdir: 02_genome_assembly +asm_raw: 01_raw_assembly +asm_purged: 02_after_purge_dups_assembly +asm_conta: 03_uncontaminated_assembly +asm: 00_assembly +asm_qc: 01_assembly_QC + +# number of threads used by pigz +pigz_threads: 4 \ No newline at end of file -- GitLab From f53fddbfd7ab8a7ab7581a44eb47968f1fad24e4 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 11:36:59 +0100 Subject: [PATCH 052/178] rename temporary snakefile --- workflow/Snakefile_obs.smk | 132 +++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 workflow/Snakefile_obs.smk diff --git a/workflow/Snakefile_obs.smk b/workflow/Snakefile_obs.smk new file mode 100644 index 0000000..6135456 --- /dev/null +++ b/workflow/Snakefile_obs.smk @@ -0,0 +1,132 @@ +configfile: ".config/masterconfig.yaml" + +# Include all the scripts +include: "scripts/from_config/hifiasm_mode.py" +include: "scripts/from_config/parameters.py" +include: "scripts/from_config/target_list.py" +include: "scripts/path_helper.py" + +# Get paths to the WD +if config["root"].startswith("."): + abs_root_path = get_abs_root_path() + res_path = get_res_path() +else: + abs_root_path = config["root"] + res_path = abs_root_path + "/" + config["resdir"] + +# Include all the rules +include: "rules/01_qc.smk" +include: "rules/02_asm.smk" +include: "rules/03_asm_qc.smk" +include: "rules/03.5_asm_qc_merqury.smk" +include: "rules/04_purge_dups.smk" +include: "rules/05_purged_asm_qc.smk" +include: "rules/05.5_purged_asm_qc_merqury.smk" +include: "rules/06_sym_link_hap.smk" +include: "rules/07_report.smk" +include: "rules/00_runtime.smk" + +# Get the filenames of inputs +IDS=config["IDS"] +bamIDS=check_bam(IDS) +fastqIDS=check_fastq(IDS) + +RUNID = run_id(config["IDS"]) +BID_RUN = run_BFid(bamIDS) +FID_RUN = run_BFid(fastqIDS) + +# Create the list of desired outputs +## For raw data +longqc_output = expand(res_path + "/{Bid}/{run}/01_raw_data_QC/02_longQC", zip, + run=BID_RUN, Bid=bamIDS), +fastqc_output = expand(res_path + "/{Fid}/{run}/01_raw_data_QC/01_fastQC/{Fid}_fastqc.{ext}", zip, + run=FID_RUN, Fid=fastqIDS, ext=["html", "zip"]) + +## Reports +REP_ID = for_report(IDS) +RUNID_REG = run_id(REP_ID) +BUSCO_LIN = busco_lin(REP_ID) +### We create additional lists for purge_dups applications +PURGE_ID = for_purge(IDS) +RUNID_PURGE = run_id(PURGE_ID) +BUSCO_LIN_PURGE = busco_lin(PURGE_ID) + +purged_report_output = expand(res_path + "/{runid}/p_report_{id}.{lin}.html", zip, + runid=RUNID_PURGE, id=PURGE_ID, lin = BUSCO_LIN_PURGE ) + +report_output = expand(res_path + "/{runid}/report_{id}.{lin}.html", zip, + runid=RUNID_REG, id=REP_ID, lin = BUSCO_LIN) + +### Same thing for trio +REP_TRIO_ID = for_report(IDS, trio = True) +RUNID_TRIO = run_id(REP_TRIO_ID) +BUSCO_LIN_TRIO = busco_lin(REP_TRIO_ID) +### We create additional lists for purge_dups applications +PURGE_ID_TRIO = for_purge(IDS, trio = True) +RUNID_PURGE_TRIO = 
run_id(PURGE_ID) +BUSCO_LIN_TRIO_PURGE = busco_lin(PURGE_ID_TRIO) + +purged_report_trio_output = expand(res_path + "/{runid}/p_report_trio_{id}.{lin}.html", zip, + runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, lin = BUSCO_LIN_TRIO_PURGE) + +report_trio_output = expand(res_path + "/{runid}/report_trio_{id}.{lin}.html", zip, + runid=RUNID_TRIO, id=REP_TRIO_ID, lin = BUSCO_LIN_TRIO) + +# Add symbolic link to final assembly +symb_link1 = expand(res_path + "/{runid}/{id}_hap{n}.fa", zip, + runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"]) +symb_link2 = expand(res_path + "/{runid}/{id}_hap{n}.fa", zip, + runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"]) +## PURGE_DUPS CUTOFFS GRAPH +cut_eval1 = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png", zip, + runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"]) +cut_eval2 = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png", zip, + runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"]) + +## BUSCO +busco_reg = expand(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/busco/{id}_hap{n}/short_summary.specific.{lin}.{id}_hap{n}.txt", zip, + runid=RUNID_REG, id=REP_ID, n=["1", "2"], lin = BUSCO_LIN) + +busco_purged_reg = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/busco/{id}_purged_hap{n}/short_summary.specific.{lin}.{id}_purged_hap{n}.txt", zip, + runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"], lin = BUSCO_LIN) + +busco_trio = expand(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/busco/{id}_hap{n}/short_summary.specific.{lin}.{id}_hap{n}.txt", zip, + runid=RUNID_TRIO, id=REP_TRIO_ID, n=["1", "2"], lin = BUSCO_LIN_TRIO) +busco_purged_trio = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/busco/{id}_purged_hap{n}/short_summary.specific.{lin}.{id}_purged_hap{n}.txt", zip, + runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"], lin = BUSCO_LIN_TRIO) + +## RUNTIME +time = expand(res_path + "/{runid}/runtime.{id}.{lin}.txt", zip, + runid = RUNID_REG, id=REP_ID, lin=BUSCO_LIN) +time_trio = expand(res_path + "/{runid}/runtime_trio.{id}.{lin}.txt", zip, + runid = RUNID_TRIO, id=REP_TRIO_ID, lin=BUSCO_LIN_TRIO) +time_purge = expand(res_path + "/{runid}/p_runtime.{id}.{lin}.txt", zip, + runid = RUNID_PURGE, id=PURGE_ID, lin=BUSCO_LIN) +time_trio_purge = expand(res_path + "/{runid}/p_runtime_trio.{id}.{lin}.txt", zip, + runid = RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, lin=BUSCO_LIN_TRIO) + +rule_all_input_list = [ + longqc_output, + fastqc_output, + cut_eval1, + cut_eval2, + symb_link1, + symb_link2, + report_output, + report_trio_output, + busco_reg, + busco_purged_reg, + busco_trio, + busco_purged_trio, + time, + time_trio, + time_trio_purge, + time_purge, + purged_report_output, + purged_report_trio_output +] + +#### target files +rule all: + input: + all_input = rule_all_input_list \ No newline at end of file -- GitLab From 6da1a440b0478bf98127245247be3d5739771d91 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 11:37:42 +0100 Subject: [PATCH 053/178] Add a dynamic hifiasm rule for HIC and default mode --- workflow/Snakefile | 165 +++++++++++---------------------------------- 1 file changed, 39 insertions(+), 126 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 6135456..b9d0b69 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ 
-1,132 +1,45 @@ configfile: ".config/masterconfig.yaml" +include: "scripts/parameter_retrieval.py" -# Include all the scripts -include: "scripts/from_config/hifiasm_mode.py" -include: "scripts/from_config/parameters.py" -include: "scripts/from_config/target_list.py" -include: "scripts/path_helper.py" +import os +import yaml -# Get paths to the WD -if config["root"].startswith("."): - abs_root_path = get_abs_root_path() - res_path = get_res_path() -else: - abs_root_path = config["root"] - res_path = abs_root_path + "/" + config["resdir"] +container_registry = config.get("container_registry", "docker://registry.forgemia.inra.fr/pangepop/mspangepop") +output_dir = config.get("output_dir", "results/") -# Include all the rules -include: "rules/01_qc.smk" -include: "rules/02_asm.smk" -include: "rules/03_asm_qc.smk" -include: "rules/03.5_asm_qc_merqury.smk" -include: "rules/04_purge_dups.smk" -include: "rules/05_purged_asm_qc.smk" -include: "rules/05.5_purged_asm_qc_merqury.smk" -include: "rules/06_sym_link_hap.smk" -include: "rules/07_report.smk" -include: "rules/00_runtime.smk" - -# Get the filenames of inputs -IDS=config["IDS"] -bamIDS=check_bam(IDS) -fastqIDS=check_fastq(IDS) - -RUNID = run_id(config["IDS"]) -BID_RUN = run_BFid(bamIDS) -FID_RUN = run_BFid(fastqIDS) - -# Create the list of desired outputs -## For raw data -longqc_output = expand(res_path + "/{Bid}/{run}/01_raw_data_QC/02_longQC", zip, - run=BID_RUN, Bid=bamIDS), -fastqc_output = expand(res_path + "/{Fid}/{run}/01_raw_data_QC/01_fastQC/{Fid}_fastqc.{ext}", zip, - run=FID_RUN, Fid=fastqIDS, ext=["html", "zip"]) - -## Reports -REP_ID = for_report(IDS) -RUNID_REG = run_id(REP_ID) -BUSCO_LIN = busco_lin(REP_ID) -### We create additional lists for purge_dups applications -PURGE_ID = for_purge(IDS) -RUNID_PURGE = run_id(PURGE_ID) -BUSCO_LIN_PURGE = busco_lin(PURGE_ID) - -purged_report_output = expand(res_path + "/{runid}/p_report_{id}.{lin}.html", zip, - runid=RUNID_PURGE, id=PURGE_ID, lin = BUSCO_LIN_PURGE ) - -report_output = expand(res_path + "/{runid}/report_{id}.{lin}.html", zip, - runid=RUNID_REG, id=REP_ID, lin = BUSCO_LIN) - -### Same thing for trio -REP_TRIO_ID = for_report(IDS, trio = True) -RUNID_TRIO = run_id(REP_TRIO_ID) -BUSCO_LIN_TRIO = busco_lin(REP_TRIO_ID) -### We create additional lists for purge_dups applications -PURGE_ID_TRIO = for_purge(IDS, trio = True) -RUNID_PURGE_TRIO = run_id(PURGE_ID) -BUSCO_LIN_TRIO_PURGE = busco_lin(PURGE_ID_TRIO) - -purged_report_trio_output = expand(res_path + "/{runid}/p_report_trio_{id}.{lin}.html", zip, - runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, lin = BUSCO_LIN_TRIO_PURGE) - -report_trio_output = expand(res_path + "/{runid}/report_trio_{id}.{lin}.html", zip, - runid=RUNID_TRIO, id=REP_TRIO_ID, lin = BUSCO_LIN_TRIO) - -# Add symbolic link to final assembly -symb_link1 = expand(res_path + "/{runid}/{id}_hap{n}.fa", zip, - runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"]) -symb_link2 = expand(res_path + "/{runid}/{id}_hap{n}.fa", zip, - runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"]) -## PURGE_DUPS CUTOFFS GRAPH -cut_eval1 = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png", zip, - runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"]) -cut_eval2 = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png", zip, - runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"]) - -## BUSCO -busco_reg = expand(res_path + 
"/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/busco/{id}_hap{n}/short_summary.specific.{lin}.{id}_hap{n}.txt", zip, - runid=RUNID_REG, id=REP_ID, n=["1", "2"], lin = BUSCO_LIN) - -busco_purged_reg = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/busco/{id}_purged_hap{n}/short_summary.specific.{lin}.{id}_purged_hap{n}.txt", zip, - runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"], lin = BUSCO_LIN) - -busco_trio = expand(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/busco/{id}_hap{n}/short_summary.specific.{lin}.{id}_hap{n}.txt", zip, - runid=RUNID_TRIO, id=REP_TRIO_ID, n=["1", "2"], lin = BUSCO_LIN_TRIO) -busco_purged_trio = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/busco/{id}_purged_hap{n}/short_summary.specific.{lin}.{id}_purged_hap{n}.txt", zip, - runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"], lin = BUSCO_LIN_TRIO) - -## RUNTIME -time = expand(res_path + "/{runid}/runtime.{id}.{lin}.txt", zip, - runid = RUNID_REG, id=REP_ID, lin=BUSCO_LIN) -time_trio = expand(res_path + "/{runid}/runtime_trio.{id}.{lin}.txt", zip, - runid = RUNID_TRIO, id=REP_TRIO_ID, lin=BUSCO_LIN_TRIO) -time_purge = expand(res_path + "/{runid}/p_runtime.{id}.{lin}.txt", zip, - runid = RUNID_PURGE, id=PURGE_ID, lin=BUSCO_LIN) -time_trio_purge = expand(res_path + "/{runid}/p_runtime_trio.{id}.{lin}.txt", zip, - runid = RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, lin=BUSCO_LIN_TRIO) - -rule_all_input_list = [ - longqc_output, - fastqc_output, - cut_eval1, - cut_eval2, - symb_link1, - symb_link2, - report_output, - report_trio_output, - busco_reg, - busco_purged_reg, - busco_trio, - busco_purged_trio, - time, - time_trio, - time_trio_purge, - time_purge, - purged_report_output, - purged_report_trio_output -] - -#### target files rule all: input: - all_input = rule_all_input_list \ No newline at end of file + expand( + os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}.bp.hap1.p_ctg.gfa"), + sample=config["samples"].keys() + ), + expand( + os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}.bp.hap2.p_ctg.gfa"), + sample=config["samples"].keys() + ) + +# Genome assembly using hifiasm +rule hifiasm: + input: + fasta=lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] + output: + hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap1.p_ctg.gfa"), + hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap2.p_ctg.gfa") + params: + prefix = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}"), + mode = get_mode, + run_1 = lambda wildcards: get_run(wildcards, run=1), + run_2 = lambda wildcards: get_run(wildcards, run=2), + purge_force = get_purge_force + benchmark: + os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hifiasm_benchmark.txt") + threads: 20 + resources: + mem_mb=250000, + time="10:00:00" + container: + f"{container_registry}/hifiasm:0.19.6" + "" + shell: + "./workflow/scripts/hifiasm_call.sh {params.mode} {params.purge_force} {threads} {input.fasta} {params.run_1} {params.run_2} {params.prefix}" + -- GitLab From b9a28ec43e75679eb765af9647531458bd484407 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 11:38:09 +0100 Subject: [PATCH 054/178] Add bash script to dynamicly run the rigth hifiasm mode --- workflow/scripts/hifiasm_call.sh | 44 ++++++++++++++++++++++++++++++++ 1 file changed, 44 
From b9a28ec43e75679eb765af9647531458bd484407 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 11:38:09 +0100 Subject: [PATCH 054/178] Add bash script to dynamically run the right hifiasm mode --- workflow/scripts/hifiasm_call.sh | 44 ++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100755 workflow/scripts/hifiasm_call.sh diff --git a/workflow/scripts/hifiasm_call.sh b/workflow/scripts/hifiasm_call.sh new file mode 100755 index 0000000..fc0e1f2 --- /dev/null +++ b/workflow/scripts/hifiasm_call.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Script to dynamically run hifiasm with the correct command based on the mode +# Author: Lucien PIAT +# For: Project Pangenoak +# Date: January 6, 2025 + +# Usage: ./hifiasm_call.sh mode purge_force threads input [run_1] [run_2] prefix + +MODE=$1 +PURGE_FORCE=$2 +THREADS=$3 +INPUT=$4 +RUN_1=$5 +RUN_2=$6 +PREFIX=$7 + +echo "MODE: $MODE" +echo "PURGE_FORCE: $PURGE_FORCE" +echo "THREADS: $THREADS" +echo "INPUT: $INPUT" +echo "RUN_1: $RUN_1" +echo "RUN_2: $RUN_2" +echo "PREFIX: $PREFIX" + + +echo ${INPUT} +ls + +# Run the appropriate hifiasm command based on the mode +case "$MODE" in + default) + echo "Running in default mode..." + hifiasm -l${PURGE_FORCE} -o ${PREFIX} -t ${THREADS} ${INPUT} + ;; + hi-c) + echo "Running in hi-c mode..." + hifiasm -l${PURGE_FORCE} -o ${PREFIX} -t ${THREADS} --h1 ${RUN_1} --h2 ${RUN_2} ${INPUT} + mv ${PREFIX}.hic.hap1.p_ctg.gfa ${PREFIX}.bp.hap1.p_ctg.gfa + mv ${PREFIX}.hic.hap2.p_ctg.gfa ${PREFIX}.bp.hap2.p_ctg.gfa + ;; + *) + echo "Unknown mode: $MODE" + ;; +esac -- GitLab From 9a71f8bb983343f2d9c71d7635d7264e5a10579e Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 11:38:20 +0100 Subject: [PATCH 055/178] remove old SLURM config --- .../snakemake_profile/slurm/CookieCutter.py | 31 -- .../slurm/cluster_config.yml | 65 ---- .config/snakemake_profile/slurm/config.yaml | 14 - .config/snakemake_profile/slurm/settings.json | 6 - .../slurm/slurm-jobscript.sh | 3 - .../snakemake_profile/slurm/slurm-status.py | 72 ---- .../snakemake_profile/slurm/slurm-submit.py | 61 ---- .../snakemake_profile/slurm/slurm_utils.py | 345 ------------------ workflow/pre-job_snakefiles/Snakefile1.smk | 78 ---- workflow/pre-job_snakefiles/Snakefile2.smk | 78 ---- workflow/pre-job_snakefiles/Snakefile3.smk | 51 --- workflow/rules/00_runtime.smk | 90 ----- workflow/scripts/path_helper.py | 29 -- 13 files changed, 923 deletions(-) delete mode 100644 .config/snakemake_profile/slurm/CookieCutter.py delete mode 100644 .config/snakemake_profile/slurm/cluster_config.yml delete mode 100644 .config/snakemake_profile/slurm/config.yaml delete mode 100644 .config/snakemake_profile/slurm/settings.json delete mode 100755 .config/snakemake_profile/slurm/slurm-jobscript.sh delete mode 100755 .config/snakemake_profile/slurm/slurm-status.py delete mode 100755 .config/snakemake_profile/slurm/slurm-submit.py delete mode 100644 .config/snakemake_profile/slurm/slurm_utils.py delete mode 100644 workflow/pre-job_snakefiles/Snakefile1.smk delete mode 100644 workflow/pre-job_snakefiles/Snakefile2.smk delete mode 100644 workflow/pre-job_snakefiles/Snakefile3.smk delete mode 100644 workflow/rules/00_runtime.smk delete mode 100644 workflow/scripts/path_helper.py diff --git a/.config/snakemake_profile/slurm/CookieCutter.py b/.config/snakemake_profile/slurm/CookieCutter.py deleted file mode 100644 index 19d61df..0000000 --- a/.config/snakemake_profile/slurm/CookieCutter.py +++ /dev/null @@ -1,31 +0,0 @@ -# -# Based on lsf CookieCutter.py -# -import os -import json - -d = os.path.dirname(__file__) -with open(os.path.join(d, "settings.json")) as fh: - settings = json.load(fh) - - -class CookieCutter: - - SBATCH_DEFAULTS = settings['SBATCH_DEFAULTS'] - CLUSTER_NAME = settings['CLUSTER_NAME'] - CLUSTER_CONFIG = 
settings['CLUSTER_CONFIG'] - ADVANCED_ARGUMENT_CONVERSION = settings['ADVANCED_ARGUMENT_CONVERSION'] - - @staticmethod - def get_cluster_option() -> str: - cluster = CookieCutter.CLUSTER_NAME - if cluster != "": - return f"--cluster={cluster}" - return "" - - @staticmethod - def get_advanced_argument_conversion() -> bool: - val = {"yes": True, "no": False}[ - CookieCutter.ADVANCED_ARGUMENT_CONVERSION - ] - return val diff --git a/.config/snakemake_profile/slurm/cluster_config.yml b/.config/snakemake_profile/slurm/cluster_config.yml deleted file mode 100644 index fea0e13..0000000 --- a/.config/snakemake_profile/slurm/cluster_config.yml +++ /dev/null @@ -1,65 +0,0 @@ -### default ressources used by snakemake (applied to all rules) -__default__: - job-name: "{rule}" - time: "96:00:00" # max run time "hours:minutes:seconds" - ntasks: 1 # nb of processes - cpus-per-task: 4 # nb of cores for each process(1 process) - mem: "60G" - nodes: 1 # Requirements nodes/servers (default: 1) - ntasks-per-node: 1 # Requirements cpu/core/task (default: 1) - output: "slurm_logs/{rule}.%N.%j.out" - error: "slurm_logs/{rule}.%N.%j.err" - mail-type: END,FAIL #email notification - mail-user: lucien.piat@inrae.fr - -### rule resources -# convert with seqtk -convert_to_fasta: - cpus-per-task: 10 - -# LTR_retriever -LTR_retriever: - cpus-per-task: 10 - -# BUSCO -busco: - mem: "100G" - cpus-per-task: 20 - -# assembly -hifiasm: - mem: "250G" - cpus-per-task: 20 - -hifiasm_hic: - mem: "250G" - cpus-per-task: 20 - -hifiasm_trio: - mem: "250G" - cpus-per-task: 20 - -# purge_dups -purge_dups: - mem: "100G" - cpus-per-task: 20 - -purge_dups_cutoffs: - mem: "100G" - cpus-per-task: 20 - -# merqury -merqury: - cpus-per-task: 20 - -meryl: - cpus-per-task: 10 - -meryl_trio: - cpus-per-task: 10 - -merqury_trio: - cpus-per-task: 20 - -purge_merqury_trio: - cpus-per-task: 20 diff --git a/.config/snakemake_profile/slurm/config.yaml b/.config/snakemake_profile/slurm/config.yaml deleted file mode 100644 index 01741b3..0000000 --- a/.config/snakemake_profile/slurm/config.yaml +++ /dev/null @@ -1,14 +0,0 @@ -restart-times: 0 -jobscript: "slurm-jobscript.sh" -cluster: "slurm-submit.py" -cluster-status: "slurm-status.py" -max-jobs-per-second: 1 -max-status-checks-per-second: 10 -local-cores: 1 -latency-wait: 60 - -## added snakemake settings -keep-going: True # Go on with independent jobs if a job fails -rerun-incomplete: True # Re-run all jobs the output of which is recognized as incomplete. 
-# keep-incomplete: True # keep results even if failed -# unlock: True \ No newline at end of file diff --git a/.config/snakemake_profile/slurm/settings.json b/.config/snakemake_profile/slurm/settings.json deleted file mode 100644 index 5ffbf2b..0000000 --- a/.config/snakemake_profile/slurm/settings.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "SBATCH_DEFAULTS": "", - "CLUSTER_NAME": "", - "CLUSTER_CONFIG": "", - "ADVANCED_ARGUMENT_CONVERSION": "no" -} diff --git a/.config/snakemake_profile/slurm/slurm-jobscript.sh b/.config/snakemake_profile/slurm/slurm-jobscript.sh deleted file mode 100755 index 391741e..0000000 --- a/.config/snakemake_profile/slurm/slurm-jobscript.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -# properties = {properties} -{exec_job} diff --git a/.config/snakemake_profile/slurm/slurm-status.py b/.config/snakemake_profile/slurm/slurm-status.py deleted file mode 100755 index 6dc2323..0000000 --- a/.config/snakemake_profile/slurm/slurm-status.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 -import re -import subprocess as sp -import shlex -import sys -import time -import logging -from CookieCutter import CookieCutter - -logger = logging.getLogger("__name__") - -STATUS_ATTEMPTS = 20 - -jobid = sys.argv[1] - -cluster = CookieCutter.get_cluster_option() - -for i in range(STATUS_ATTEMPTS): - try: - sacct_res = sp.check_output(shlex.split(f"sacct {cluster} -P -b -j {jobid} -n")) - res = { - x.split("|")[0]: x.split("|")[1] - for x in sacct_res.decode().strip().split("\n") - } - break - except sp.CalledProcessError as e: - logger.error("sacct process error") - logger.error(e) - except IndexError as e: - logger.error(e) - pass - # Try getting job with scontrol instead in case sacct is misconfigured - try: - sctrl_res = sp.check_output( - shlex.split(f"scontrol {cluster} -o show job {jobid}") - ) - m = re.search(r"JobState=(\w+)", sctrl_res.decode()) - res = {jobid: m.group(1)} - break - except sp.CalledProcessError as e: - logger.error("scontrol process error") - logger.error(e) - if i >= STATUS_ATTEMPTS - 1: - print("failed") - exit(0) - else: - time.sleep(1) - -status = res[jobid] - -if status == "BOOT_FAIL": - print("failed") -elif status == "OUT_OF_MEMORY": - print("failed") -elif status.startswith("CANCELLED"): - print("failed") -elif status == "COMPLETED": - print("success") -elif status == "DEADLINE": - print("failed") -elif status == "FAILED": - print("failed") -elif status == "NODE_FAIL": - print("failed") -elif status == "PREEMPTED": - print("failed") -elif status == "TIMEOUT": - print("failed") -elif status == "SUSPENDED": - print("running") -else: - print("running") diff --git a/.config/snakemake_profile/slurm/slurm-submit.py b/.config/snakemake_profile/slurm/slurm-submit.py deleted file mode 100755 index 296b756..0000000 --- a/.config/snakemake_profile/slurm/slurm-submit.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -""" -Snakemake SLURM submit script. 
-""" -from snakemake.utils import read_job_properties - -import slurm_utils -from CookieCutter import CookieCutter - -# cookiecutter arguments -SBATCH_DEFAULTS = CookieCutter.SBATCH_DEFAULTS -CLUSTER = CookieCutter.get_cluster_option() -CLUSTER_CONFIG = "" -ADVANCED_ARGUMENT_CONVERSION = CookieCutter.get_advanced_argument_conversion() - -RESOURCE_MAPPING = { - "time": ("time", "runtime", "walltime"), - "mem": ("mem", "mem_mb", "ram", "memory"), - "mem-per-cpu": ("mem-per-cpu", "mem_per_cpu", "mem_per_thread"), - "nodes": ("nodes", "nnodes"), - "partition": ("partition", "queue"), -} - -# parse job -jobscript = slurm_utils.parse_jobscript() -job_properties = read_job_properties(jobscript) - -sbatch_options = {} -cluster_config = slurm_utils.load_cluster_config(CLUSTER_CONFIG) - -# 1) sbatch default arguments and cluster -sbatch_options.update(slurm_utils.parse_sbatch_defaults(SBATCH_DEFAULTS)) -sbatch_options.update(slurm_utils.parse_sbatch_defaults(CLUSTER)) - -# 2) cluster_config defaults -sbatch_options.update(cluster_config["__default__"]) - -# 3) Convert resources (no unit conversion!) and threads -sbatch_options.update( - slurm_utils.convert_job_properties(job_properties, RESOURCE_MAPPING) -) - -# 4) cluster_config for particular rule -sbatch_options.update(cluster_config.get(job_properties.get("rule"), {})) - -# 5) cluster_config options -sbatch_options.update(job_properties.get("cluster", {})) - -# 6) Advanced conversion of parameters -if ADVANCED_ARGUMENT_CONVERSION: - sbatch_options = slurm_utils.advanced_argument_conversion(sbatch_options) - -# 7) Format pattern in snakemake style -sbatch_options = slurm_utils.format_values(sbatch_options, job_properties) - -# ensure sbatch output dirs exist -for o in ("output", "error"): - slurm_utils.ensure_dirs_exist(sbatch_options[o]) if o in sbatch_options else None - -# submit job and echo id back to Snakemake (must be the only stdout) -print(slurm_utils.submit_job(jobscript, **sbatch_options)) diff --git a/.config/snakemake_profile/slurm/slurm_utils.py b/.config/snakemake_profile/slurm/slurm_utils.py deleted file mode 100644 index d43c070..0000000 --- a/.config/snakemake_profile/slurm/slurm_utils.py +++ /dev/null @@ -1,345 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys -from os.path import dirname -import re -import math -import argparse -import subprocess as sp -from io import StringIO - -from snakemake import io -from snakemake.io import Wildcards -from snakemake.utils import SequenceFormatter -from snakemake.utils import AlwaysQuotedFormatter -from snakemake.utils import QuotedFormatter -from snakemake.exceptions import WorkflowError -from snakemake.logging import logger - -from CookieCutter import CookieCutter - - -def _convert_units_to_mb(memory): - """If memory is specified with SI unit, convert to MB""" - if isinstance(memory, int) or isinstance(memory, float): - return int(memory) - siunits = {"K": 1e-3, "M": 1, "G": 1e3, "T": 1e6} - regex = re.compile(r"(\d+)({})$".format("|".join(siunits.keys()))) - m = regex.match(memory) - if m is None: - logger.error( - ( - f"unsupported memory specification '{memory}';" - " allowed suffixes: [K|M|G|T]" - ) - ) - sys.exit(1) - factor = siunits[m.group(2)] - return int(int(m.group(1)) * factor) - - -def parse_jobscript(): - """Minimal CLI to require/only accept single positional argument.""" - p = argparse.ArgumentParser(description="SLURM snakemake submit script") - p.add_argument("jobscript", help="Snakemake jobscript with job properties.") - return p.parse_args().jobscript - - -def 
parse_sbatch_defaults(parsed): - """Unpack SBATCH_DEFAULTS.""" - d = parsed.split() if type(parsed) == str else parsed - args = {} - for keyval in [a.split("=") for a in d]: - k = keyval[0].strip().strip("-") - v = keyval[1].strip() if len(keyval) == 2 else None - args[k] = v - return args - - -def load_cluster_config(path): - """Load config to dict - - Load configuration to dict either from absolute path or relative - to profile dir. - """ - if path: - path = os.path.join(dirname(__file__), os.path.expandvars(path)) - dcc = io.load_configfile(path) - else: - dcc = {} - if "__default__" not in dcc: - dcc["__default__"] = {} - return dcc - - -# adapted from format function in snakemake.utils -def format(_pattern, _quote_all=False, **kwargs): # noqa: A001 - """Format a pattern in Snakemake style. - This means that keywords embedded in braces are replaced by any variable - values that are available in the current namespace. - """ - fmt = SequenceFormatter(separator=" ") - if _quote_all: - fmt.element_formatter = AlwaysQuotedFormatter() - else: - fmt.element_formatter = QuotedFormatter() - try: - return fmt.format(_pattern, **kwargs) - except KeyError as ex: - raise NameError( - f"The name {ex} is unknown in this context. Please " - "make sure that you defined that variable. " - "Also note that braces not used for variable access " - "have to be escaped by repeating them " - ) - - -# adapted from Job.format_wildcards in snakemake.jobs -def format_wildcards(string, job_properties): - """ Format a string with variables from the job. """ - - class Job(object): - def __init__(self, job_properties): - for key in job_properties: - setattr(self, key, job_properties[key]) - - job = Job(job_properties) - if "params" in job_properties: - job._format_params = Wildcards(fromdict=job_properties["params"]) - else: - job._format_params = None - if "wildcards" in job_properties: - job._format_wildcards = Wildcards(fromdict=job_properties["wildcards"]) - else: - job._format_wildcards = None - _variables = dict() - _variables.update( - dict(params=job._format_params, wildcards=job._format_wildcards) - ) - if hasattr(job, "rule"): - _variables.update(dict(rule=job.rule)) - try: - return format(string, **_variables) - except NameError as ex: - raise WorkflowError( - "NameError with group job {}: {}".format(job.jobid, str(ex)) - ) - except IndexError as ex: - raise WorkflowError( - "IndexError with group job {}: {}".format(job.jobid, str(ex)) - ) - - -# adapted from ClusterExecutor.cluster_params function in snakemake.executor -def format_values(dictionary, job_properties): - formatted = dictionary.copy() - for key, value in list(formatted.items()): - if key == "mem": - value = str(_convert_units_to_mb(value)) - if isinstance(value, str): - try: - formatted[key] = format_wildcards(value, job_properties) - except NameError as e: - msg = "Failed to format cluster config " "entry for job {}.".format( - job_properties["rule"] - ) - raise WorkflowError(msg, e) - return formatted - - -def convert_job_properties(job_properties, resource_mapping=None): - options = {} - if resource_mapping is None: - resource_mapping = {} - resources = job_properties.get("resources", {}) - for k, v in resource_mapping.items(): - options.update({k: resources[i] for i in v if i in resources}) - - if "threads" in job_properties: - options["cpus-per-task"] = job_properties["threads"] - return options - - -def ensure_dirs_exist(path): - """Ensure output folder for Slurm log files exist.""" - di = dirname(path) - if di == "": - return - if not 
os.path.exists(di): - os.makedirs(di, exist_ok=True) - return - - -def format_sbatch_options(**sbatch_options): - """Format sbatch options""" - options = [] - for k, v in sbatch_options.items(): - val = "" - if v is not None: - val = f"={v}" - options.append(f"--{k}{val}") - return options - - -def submit_job(jobscript, **sbatch_options): - """Submit jobscript and return jobid.""" - options = format_sbatch_options(**sbatch_options) - try: - cmd = ["sbatch"] + ["--parsable"] + options + [jobscript] - res = sp.check_output(cmd) - except sp.CalledProcessError as e: - raise e - # Get jobid - res = res.decode() - try: - jobid = re.search(r"(\d+)", res).group(1) - except Exception as e: - raise e - return jobid - - -def advanced_argument_conversion(arg_dict): - """Experimental adjustment of sbatch arguments to the given or default partition.""" - # Currently not adjusting for multiple node jobs - nodes = int(arg_dict.get("nodes", 1)) - if nodes > 1: - return arg_dict - partition = arg_dict.get("partition", None) or _get_default_partition() - constraint = arg_dict.get("constraint", None) - ncpus = int(arg_dict.get("cpus-per-task", 1)) - runtime = arg_dict.get("time", None) - memory = _convert_units_to_mb(arg_dict.get("mem", 0)) - config = _get_cluster_configuration(partition, constraint, memory) - mem = arg_dict.get("mem", ncpus * min(config["MEMORY_PER_CPU"])) - mem = _convert_units_to_mb(mem) - if mem > max(config["MEMORY"]): - logger.info( - f"requested memory ({mem}) > max memory ({max(config['MEMORY'])}); " - "adjusting memory settings" - ) - mem = max(config["MEMORY"]) - - # Calculate available memory as defined by the number of requested - # cpus times memory per cpu - AVAILABLE_MEM = ncpus * min(config["MEMORY_PER_CPU"]) - # Add additional cpus if memory is larger than AVAILABLE_MEM - if mem > AVAILABLE_MEM: - logger.info( - f"requested memory ({mem}) > " - f"ncpus x MEMORY_PER_CPU ({AVAILABLE_MEM}); " - "trying to adjust number of cpus up" - ) - ncpus = int(math.ceil(mem / min(config["MEMORY_PER_CPU"]))) - if ncpus > max(config["CPUS"]): - logger.info( - f"ncpus ({ncpus}) > available cpus ({max(config['CPUS'])}); " - "adjusting number of cpus down" - ) - ncpus = min(int(max(config["CPUS"])), ncpus) - adjusted_args = {"mem": int(mem), "cpus-per-task": ncpus} - - # Update time. If requested time is larger than maximum allowed time, reset - if runtime: - runtime = time_to_minutes(runtime) - time_limit = max(config["TIMELIMIT_MINUTES"]) - if runtime > time_limit: - logger.info( - f"time (runtime) > time limit {time_limit}; " "adjusting time down" - ) - adjusted_args["time"] = time_limit - - # update and return - arg_dict.update(adjusted_args) - return arg_dict - - -timeformats = [ - re.compile(r"^(?P<days>\d+)-(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+)$"), - re.compile(r"^(?P<days>\d+)-(?P<hours>\d+):(?P<minutes>\d+)$"), - re.compile(r"^(?P<days>\d+)-(?P<hours>\d+)$"), - re.compile(r"^(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+)$"), - re.compile(r"^(?P<minutes>\d+):(?P<seconds>\d+)$"), - re.compile(r"^(?P<minutes>\d+)$"), -] - - -def time_to_minutes(time): - """Convert time string to minutes. - - According to slurm: - - Acceptable time formats include "minutes", "minutes:seconds", - "hours:minutes:seconds", "days-hours", "days-hours:minutes" - and "days-hours:minutes:seconds". 
- - """ - if not isinstance(time, str): - time = str(time) - d = {"days": 0, "hours": 0, "minutes": 0, "seconds": 0} - regex = list(filter(lambda regex: regex.match(time) is not None, timeformats)) - if len(regex) == 0: - return - assert len(regex) == 1, "multiple time formats match" - m = regex[0].match(time) - d.update(m.groupdict()) - minutes = ( - int(d["days"]) * 24 * 60 - + int(d["hours"]) * 60 - + int(d["minutes"]) - + math.ceil(int(d["seconds"]) / 60) - ) - assert minutes > 0, "minutes has to be greater than 0" - return minutes - - -def _get_default_partition(): - """Retrieve default partition for cluster""" - cluster = CookieCutter.get_cluster_option() - cmd = f"sinfo -O partition {cluster}" - res = sp.check_output(cmd.split()) - m = re.search(r"(?P<partition>\S+)\*", res.decode(), re.M) - partition = m.group("partition") - return partition - - -def _get_cluster_configuration(partition, constraints=None, memory=0): - """Retrieve cluster configuration. - - Retrieve cluster configuration for a partition filtered by - constraints, memory and cpus - - """ - try: - import pandas as pd - except ImportError: - print( - "Error: currently advanced argument conversion " - "depends on 'pandas'.", file=sys.stderr - ) - sys.exit(1) - - if constraints: - constraint_set = set(constraints.split(",")) - cluster = CookieCutter.get_cluster_option() - cmd = f"sinfo -e -o %all -p {partition} {cluster}".split() - try: - output = sp.Popen(" ".join(cmd), shell=True, stdout=sp.PIPE).communicate() - except Exception as e: - print(e) - raise - data = re.sub("^CLUSTER:.+\n", "", re.sub(" \\|", "|", output[0].decode())) - df = pd.read_csv(StringIO(data), sep="|") - try: - df["TIMELIMIT_MINUTES"] = df["TIMELIMIT"].apply(time_to_minutes) - df["MEMORY_PER_CPU"] = df["MEMORY"] / df["CPUS"] - df["FEATURE_SET"] = df["AVAIL_FEATURES"].str.split(",").apply(set) - except Exception as e: - print(e) - raise - if constraints: - constraint_set = set(constraints.split(",")) - i = df["FEATURE_SET"].apply(lambda x: len(x.intersection(constraint_set)) > 0) - df = df.loc[i] - memory = min(_convert_units_to_mb(memory), max(df["MEMORY"])) - df = df.loc[df["MEMORY"] >= memory] - return df diff --git a/workflow/pre-job_snakefiles/Snakefile1.smk b/workflow/pre-job_snakefiles/Snakefile1.smk deleted file mode 100644 index da27514..0000000 --- a/workflow/pre-job_snakefiles/Snakefile1.smk +++ /dev/null @@ -1,78 +0,0 @@ -configfile: ".config/masterconfig.yaml" - -######################## Python functions ######################## -import os, re -# tar & tar.gz filename -def get_tar_name(dirpath): - IDS = [] - for file in os.listdir(dirpath): - splitResult = file.split(".") - ext = splitResult[-1] - if ext == "tar": - filename= ".".join(splitResult[:-1]) - IDS.append(filename) - elif ext == "gz": - if splitResult[-2] == "tar": - filename= ".".join(splitResult[:-2]) - IDS.append(filename) - return(IDS) - -# file extension -def data_ext(dir, id): - for filename in os.listdir(dir): - if re.match(id, filename): - splitResult = filename.split(".") - ext = splitResult[-1] - if ext == "tar": - return(str(config["data"] + "/{id}.tar")) - else: - return(str(config["data"] + "/{id}.tar.gz")) - -######################## Snakemake ######################## - -### paths -if config["root"].startswith("."): - abs_root_path = get_abs_root_path() - res_path = get_res_path() -else: - abs_root_path = config["root"] - res_path = abs_root_path + "/" + config["resdir"] - -### get filenames for workflow -if config["get_all_tar_filename"]: - 
IDS=get_tar_name(config["data"]) -else: - IDS=config["tarIDS"] - -###### results path ###### - -### target files -rule all: - input: - expand("{resdir}/{stepdir}/extract/{id}", resdir=res_path, stepdir=config["rawdir"], id=IDS), - -### rules -# extract & rename files from .tar containing .bam files -rule extract_targz_data: - input: - lambda wildcards: data_ext(config["data"], wildcards.id), - output: - directory("{resdir}/{stepdir}/extract/{id}") - priority: 10 - shell: - "mkdir -p {output} && " - "tar -xf {input} -C {output} && " - "cd {output} && " - "find . -name ""ccs.bam"" -exec mv '{{}}' {wildcards.id}.bam \;" - -# move bam and fasta + fastq files -rule move_files: - params: - root=abs_root_path, - bam_path=abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"], - fastx_path=abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"], - shell: - "cd {params.root} && " - "mkdir -p {params.bam_path} {params.fastx_path} && " - "find . -path ""*/extract/*"" -name ""*.bam"" -exec mv ""{{}}"" {params.bam_path} \; && " - "find . -path ""*/extract/*"" -name ""*.fast*"" -exec mv ""{{}}"" {params.fastx_path} \;" \ No newline at end of file diff --git a/workflow/pre-job_snakefiles/Snakefile2.smk b/workflow/pre-job_snakefiles/Snakefile2.smk deleted file mode 100644 index 1dfcd0d..0000000 --- a/workflow/pre-job_snakefiles/Snakefile2.smk +++ /dev/null @@ -1,78 +0,0 @@ -configfile: ".config/masterconfig.yaml" - -######################## Python functions ######################## -import os -# bam filename -def get_bams_name(dirpath): - IDS = [] - for file in os.listdir(dirpath): - splitResult = file.split(".") - ext = splitResult[-1] - if ext == "bam": - filename= ".".join(splitResult[:-1]) - IDS.append(filename) - return(IDS) - -######################## Snakemake ######################## - -### root path -if config["root"].startswith("."): - abs_root_path = get_abs_root_path() - res_path = get_res_path() -else: - abs_root_path = config["root"] - res_path = abs_root_path + "/" + config["resdir"] - -###### results path ###### - -### get filenames -IDS=get_bams_name(abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"]) - -### target files -rule all: - input: - expand(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz", id=IDS), - expand(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz", id=IDS) - -### rules -## PacBio .bam conversion with smrtlink -# .bam.pbi needed for bam_to_ conversion rules -rule smrtlink_index: - input: - abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam" - output: - abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam.pbi" - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/smrtlink9.0" - shell: - "pbindex {input}" - -# convert .bam to .fastq.gz -rule smrtlink_bam_to_fastq: - input: - bam = abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam", - bam_pbi = rules.smrtlink_index.output - output: - abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz" - params: - prefix=abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}" - priority: 2 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/smrtlink9.0" - shell: - "bam2fastq -o {params.prefix} {input.bam}" - -# convert .bam to .fasta.gz -rule smrtlink_bam_to_fasta: - input: - bam = abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam", - bam_pbi = 
rules.smrtlink_index.output - output: - abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" - params: - prefix=abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}" - priority: 2 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/smrtlink9.0" - shell: - "bam2fasta -o {params.prefix} {input.bam}" \ No newline at end of file diff --git a/workflow/pre-job_snakefiles/Snakefile3.smk b/workflow/pre-job_snakefiles/Snakefile3.smk deleted file mode 100644 index 757ab6b..0000000 --- a/workflow/pre-job_snakefiles/Snakefile3.smk +++ /dev/null @@ -1,51 +0,0 @@ -configfile: ".config/masterconfig.yaml" - -######################## Python functions ######################## -import os -# fastq without fasta filename -def get_fastq_name(dirpath): - IDS = [] - for file in os.listdir(dirpath): - splitResult = file.split(".") - ext = splitResult[-1] - if ext == "gz": - if splitResult[-2] == "fastq": - filename= ".".join(splitResult[:-2]) - fasta_filename = dirpath + "/" + filename + ".fasta.gz" - if not os.path.exists(fasta_filename): - IDS.append(filename) - return(IDS) - -######################## Snakemake ######################## -### root path -if config["root"].startswith("."): - abs_root_path = get_abs_root_path() - res_path = get_res_path() -else: - abs_root_path = config["root"] - res_path = abs_root_path + "/" + config["resdir"] - -###### results path ###### - -### get filenames -IDS = get_fastq_name(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"]) - -### target files -rule all: - input: - expand(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz", id=IDS) - -### rules -# if only fastq : convert to fasta with seqtk + zip -rule convert_to_fasta: - input: - abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz" - output: - abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" - params: - path=abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] - threads: 10 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/seqtk1.3" - shell: - "seqtk seq -a {input} | gzip > {output}" \ No newline at end of file diff --git a/workflow/rules/00_runtime.smk b/workflow/rules/00_runtime.smk deleted file mode 100644 index b7c7b56..0000000 --- a/workflow/rules/00_runtime.smk +++ /dev/null @@ -1,90 +0,0 @@ -rule start_time: - output: - temp(res_path + "/{runid}/runtime.txt") - priority: 20 - run: - import time - start = time.time() - with open(output[0], "w") as out: - out.write(str(start)) - -rule elasped_time: - input: - rules.start_time.output, - rules.rename_report.output - output: - res_path + "/{runid}/p_runtime.{id}.{lin}.txt" - run: - import time - from datetime import timedelta - - with open(input[0], "r") as inp: - start = inp.read() - - end = time.time() - elapsed_time = end - float(start) - td = timedelta(seconds=elapsed_time) - - with open(output[0], "w") as out: - out.write("Runtime (hh:mm:ss): " + str(td)) - -rule elasped_time_no_purge: - input: - rules.start_time.output, - rules.rename_no_purge_report.output - output: - res_path + "/{runid}/runtime.{id}.{lin}.txt" - run: - import time - from datetime import timedelta - - with open(input[0], "r") as inp: - start = inp.read() - - end = time.time() - elapsed_time = end - float(start) - td = timedelta(seconds=elapsed_time) - - with open(output[0], "w") as out: - out.write("Runtime (hh:mm:ss): " + str(td)) - - -rule elasped_time_trio: - input: - 
rules.start_time.output, - rules.rename_report_trio.output - output: - res_path + "/{runid}/p_runtime_trio.{id}.{lin}.txt" - run: - import time - from datetime import timedelta - - with open(input[0], "r") as inp: - start = inp.read() - - end = time.time() - elapsed_time = end - float(start) - td = timedelta(seconds=elapsed_time) - - with open(output[0], "w") as out: - out.write("Runtime (hh:mm:ss): " + str(td)) - -rule elasped_time_trio_no_purge: - input: - rules.start_time.output, - rules.no_purge_rename_report_trio.output - output: - res_path + "/{runid}/runtime_trio.{id}.{lin}.txt" - run: - import time - from datetime import timedelta - - with open(input[0], "r") as inp: - start = inp.read() - - end = time.time() - elapsed_time = end - float(start) - td = timedelta(seconds=elapsed_time) - - with open(output[0], "w") as out: - out.write("Runtime (hh:mm:ss): " + str(td)) \ No newline at end of file diff --git a/workflow/scripts/path_helper.py b/workflow/scripts/path_helper.py deleted file mode 100644 index e197410..0000000 --- a/workflow/scripts/path_helper.py +++ /dev/null @@ -1,29 +0,0 @@ -import os - -###### root path ###### -def get_abs_root_path(): - abs_root_path = os.path.abspath(config["root"]) - return(abs_root_path) - - -###### results path ###### -def get_res_path(): - abs_root_path = os.path.abspath(config["root"]) - res_path= abs_root_path + "/" + config["resdir"] - return(res_path) - -def find_all_assemblies(): - """ - Output all the asembled genomes in the results directories - """ - res_path = get_res_path() - assemblies = [] - for root, _, files in os.walk(res_path): - for file in files: - if file.endswith(".fa.gz"): - assemblies.append(os.path.join(root, file)) - return assemblies - -def get_ref(): - ref_path = os.path.abspath(config["reference_genome"]) - return(ref_path) \ No newline at end of file -- GitLab From 9e7a6ea744d696dea70eb11ba031563ede16e6c7 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 13:40:20 +0100 Subject: [PATCH 056/178] add pigz rule for fasta conversion --- .config/masterconfig.yaml | 2 -- workflow/Snakefile | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml index e639ff0..04c82e6 100644 --- a/.config/masterconfig.yaml +++ b/.config/masterconfig.yaml @@ -37,5 +37,3 @@ asm_conta: 03_uncontaminated_assembly asm: 00_assembly asm_qc: 01_assembly_QC -# number of threads used by pigz -pigz_threads: 4 \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index b9d0b69..a767815 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -10,11 +10,11 @@ output_dir = config.get("output_dir", "results/") rule all: input: expand( - os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}.bp.hap1.p_ctg.gfa"), + os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap1.fa.gz"), sample=config["samples"].keys() ), expand( - os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}.bp.hap2.p_ctg.gfa"), + os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap2.fa.gz"), sample=config["samples"].keys() ) @@ -43,3 +43,31 @@ rule hifiasm: shell: "./workflow/scripts/hifiasm_call.sh {params.mode} {params.purge_force} {threads} {input.fasta} {params.run_1} {params.run_2} {params.prefix}" +# Convert the gfa files of hifiasm to fasta +TO_FA_CMD = r"""/^S/{print ">"$2;print $3}""" + +rule pigz_gfa_to_fasta: + input: + hap1_gfa = 
rules.hifiasm.output.hap1, + hap2_gfa = rules.hifiasm.output.hap2 + output: + hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}_hap1.fa.gz"), + hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}_hap2.fa.gz") + threads: 4 + resources: + mem_mb=25000, + container: + f"{container_registry}/pigz" + shell: + """awk {TO_FA_CMD:q} {input.hap1_gfa} | pigz -p {threads} > {output.hap1} &&""" + """awk {TO_FA_CMD:q} {input.hap2_gfa} | pigz -p {threads} > {output.hap2}""" + +# Potentially purge the haplotigs using purge_dups +rule pruge_haplotigs: + input: + hap1_fasta = rules.pigz_gfa_to_fasta.output.hap1, + hap2_fasta = rules.pigz_gfa_to_fasta.output.hap2 + output: + hap1 = os.path.join(output_dir, "{sample}_results", "02_final_asembly","{sample}_final_hap1.fasta.gz"), + hap2 = os.path.join(output_dir, "{sample}_results", "02_final_asembly","{sample}_final_hap2.fasta.gz") + params: \ No newline at end of file -- GitLab From 4af769062558fbb2bbff5b8ad1b2366f27455a62 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 13:40:37 +0100 Subject: [PATCH 057/178] remove obsolete file --- workflow/rules/02_asm.smk | 102 -------------------------------------- 1 file changed, 102 deletions(-) delete mode 100644 workflow/rules/02_asm.smk diff --git a/workflow/rules/02_asm.smk b/workflow/rules/02_asm.smk deleted file mode 100644 index 02fa590..0000000 --- a/workflow/rules/02_asm.smk +++ /dev/null @@ -1,102 +0,0 @@ - -### haplotypes assembly -# REGULAR MODE -rule hifiasm: - input: - get_fasta - output: - hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap1.p_ctg.gfa", - hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap2.p_ctg.gfa" - params: - prefix = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}", - purge_force = get_purge_force - benchmark: - abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_benchmark.txt" - threads: 20 - resources: - mem_mb=250000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm:0.19.6" - shell: - "hifiasm -l{params.purge_force} -o {params.prefix} -t {threads} {input}" - -# HI-C -rule hifiasm_hic: - input: - # Hi-C paired-end reads - r1 = get_r1, - r2 = get_r2, - # hifi reads - hifi = get_fasta - output: - hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap1.p_ctg.gfa", - hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap2.p_ctg.gfa" - params: - prefix= abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}", - purge_force = get_purge_force - benchmark: - abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_hic_benchmark.txt" - threads: 20 - resources: - mem_mb=250000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm:0.19.6" - shell: - "hifiasm -l{params.purge_force} -o {params.prefix} -t {threads} --h1 {input.r1} --h2 {input.r2} {input.hifi}" - -# TRIO BINNING -rule yak: - input: - p1 = get_p1, - p2 = get_p2 - output: - p1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent1.yak", - p2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent2.yak" - benchmark: - abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_yak_benchmark.txt" - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/yak:0.1" - shell: - "yak count -k31 -b37 -t16 -o {output.p1} {input.p1} && " - "yak count -k31 -b37 -t16 -o {output.p2} {input.p2}" - -### trio binning assembly -rule hifiasm_trio: - input: - p1 = rules.yak.output.p1, - p2 = rules.yak.output.p2, - child = get_fasta - output: - hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap1.p_ctg.gfa", - hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap2.p_ctg.gfa" - params: - prefix = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" - benchmark: - abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_trio_benchmark.txt" - threads: 20 - resources: - mem_mb=250000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm:0.19.6" - shell: - "hifiasm -o {params.prefix} -t {threads} -1 {input.p1} -2 {input.p2} {input.child}" - -### hifiasm haplotypes .gfa to .fa.gz -# variable for awk command -TO_FA_CMD = r"""/^S/{print ">"$2;print $3}""" - -rule hap_gfa_to_fasta: - input: - hap1 = get_mode_hap1, - hap2 = get_mode_hap2 - output: - hap1_fa = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap1.fa.gz", - hap2_fa = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap2.fa.gz" - params: - pigz_p = config["pigz_threads"] - threads: config["pigz_threads"] - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/pigz" - shell: - """awk {TO_FA_CMD:q} {input.hap1} | pigz -p {params.pigz_p} > {output.hap1_fa} &&""" - """awk {TO_FA_CMD:q} {input.hap2} | pigz -p {params.pigz_p} > {output.hap2_fa}""" \ No newline at end of file -- GitLab
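For reference, the TO_FA_CMD awk one-liner shared by the new pigz_gfa_to_fasta rule and the hap_gfa_to_fasta rule deleted above turns each GFA S-line into a FASTA record (header from column 2, sequence from column 3). A sketch with hypothetical file names:
awk '/^S/{print ">"$2;print $3}' S1.bp.hap1.p_ctg.gfa | pigz -p 4 > S1_hap1.fasta.gz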
"/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent2.yak" - benchmark: - abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_yak_benchmark.txt" - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/yak:0.1" - shell: - "yak count -k31 -b37 -t16 -o {output.p1} {input.p1} && " - "yak count -k31 -b37 -t16 -o {output.p2} {input.p2}" - -### trio binning assembly -rule hifiasm_trio: - input: - p1 = rules.yak.output.p1, - p2 = rules.yak.output.p2, - child = get_fasta - output: - hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap1.p_ctg.gfa", - hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap2.p_ctg.gfa" - params: - prefix = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" - benchmark: - abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_trio_benchmark.txt" - threads: 20 - resources: - mem_mb=250000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm:0.19.6" - shell: - "hifiasm -o {params.prefix} -t {threads} -1 {input.p1} -2 {input.p2} {input.child}" - -### hifiasm haplotypes .gfa to .fa.gz -# variable for awk command -TO_FA_CMD = r"""/^S/{print ">"$2;print $3}""" - -rule hap_gfa_to_fasta: - input: - hap1 = get_mode_hap1, - hap2 = get_mode_hap2 - output: - hap1_fa = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap1.fa.gz", - hap2_fa = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap2.fa.gz" - params: - pigz_p = config["pigz_threads"] - threads: config["pigz_threads"] - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/pigz" - shell: - """awk {TO_FA_CMD:q} {input.hap1} | pigz -p {params.pigz_p} > {output.hap1_fa} &&""" - """awk {TO_FA_CMD:q} {input.hap2} | pigz -p {params.pigz_p} > {output.hap2_fa}""" \ No newline at end of file -- GitLab From bde50ddbf6873c6d834e6d88999b0d78b554ee41 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 16:44:18 +0100 Subject: [PATCH 058/178] Add echos for terminal output --- local_run.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/local_run.sh b/local_run.sh index c26b9f6..94d7544 100755 --- a/local_run.sh +++ b/local_run.sh @@ -17,9 +17,9 @@ run_snakemake() { dag) snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES --dag > dag.dot if [ $? -eq 0 ]; then - echo "DAG has been successfully generated as dag.dot" + echo "Asm4pg -> DAG has been successfully generated as dag.dot" else - echo "Error: Failed to generate DAG." + echo "Asm4pg -> Error: Failed to generate DAG." exit 1 fi ;; @@ -35,9 +35,9 @@ run_snakemake() { # Check if the Snakemake command was successful if [ $? -eq 0 ]; then - echo "Snakemake completed successfully." + echo "Asm4pg -> Snakemake workflow completed successfully." else - echo "Error: Snakemake execution failed." + echo "Asm4pg -> Error: Snakemake workflow execution failed." 
exit 1 fi } -- GitLab From f4389c7eb003ba650534004db495b6925350536a Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 16:44:41 +0100 Subject: [PATCH 059/178] Remove old purge dups rules --- workflow/rules/04_purge_dups.smk | 67 -------------------------------- 1 file changed, 67 deletions(-) delete mode 100644 workflow/rules/04_purge_dups.smk diff --git a/workflow/rules/04_purge_dups.smk b/workflow/rules/04_purge_dups.smk deleted file mode 100644 index ad30517..0000000 --- a/workflow/rules/04_purge_dups.smk +++ /dev/null @@ -1,67 +0,0 @@ -### to purge haplotigs in hifiasm assembly -# input haplotypes -HAP_FA_GZ = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap{n}.fa.gz" - -rule purge_dups_cutoffs: - input: - assembly = HAP_FA_GZ, - reads = get_fasta - output: - paf = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/{id}_hap{n}.paf.gz", - calcuts = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/calcuts.log", - cutoffs = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs" - params: - dir=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}" - benchmark: - res_path + "/{runid}/benchmark/{id}_hap{n}_purgedups_cutoffs.txt" - threads: 20 - resources: - mem_mb=100000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/purge_dups1.2.5" - shell: - # generate paf file - "minimap2 -xasm20 {input.assembly} {input.reads} | gzip -c - > {output.paf} && " - "pbcstat {params.dir}/*.paf.gz -O {params.dir} && " - "calcuts {params.dir}/PB.stat > {output.cutoffs} 2>{output.calcuts}" - -rule purge_dups: - input: - assembly = HAP_FA_GZ, - cutoffs = rules.purge_dups_cutoffs.output.cutoffs - output: - purge = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/{id}_hap{n}.purged.fa", - split = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/{id}_hap{n}.split", - self_paf = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/{id}_hap{n}.split.self.paf.gz", - bed = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/dups.bed", - log = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/purge_dups.log" - params: - dir=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}" - benchmark: - res_path + "/{runid}/benchmark/{id}_hap{n}_purgedups.txt" - threads: 20 - resources: - mem_mb=100000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/purge_dups1.2.5" - shell: - # split assembly & self-self alignment - "split_fa {input.assembly} > {output.split} && " - "minimap2 -xasm5 -DP {output.split} {output.split} | gzip -c - > {output.split}.self.paf.gz && " - # purge haplotigs & overlaps - "purge_dups -2 -T cutoffs -c {params.dir}/PB.base.cov {output.self_paf} > {output.bed} 2> {output.log} && " - # get purged primary and haplotig sequences from draft assembly - "get_seqs -e {output.bed} {input.assembly} -p {params.dir}/{wildcards.id}_hap{wildcards.n}" - -### make purge_dups cutoffs graph -rule cutoffs_eval: - input: - rules.purge_dups_cutoffs.output.cutoffs - output: - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png" - params: - 
dir=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}", - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/matplotlib0.11.5" - shell: - "python3 workflow/scripts/hist_plot.py -c {input} {params.dir}/PB.stat {output}" \ No newline at end of file -- GitLab From 691bc54197cba7f0a7eecf039c82d4513b594672 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 16:45:04 +0100 Subject: [PATCH 060/178] Add echoes for better communication --- workflow/scripts/hifiasm_call.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/workflow/scripts/hifiasm_call.sh b/workflow/scripts/hifiasm_call.sh index fc0e1f2..4a6a1d1 100755 --- a/workflow/scripts/hifiasm_call.sh +++ b/workflow/scripts/hifiasm_call.sh @@ -14,6 +14,7 @@ RUN_1=$5 RUN_2=$6 PREFIX=$7 +echo "Asm4pg -> Given hifiasm parameters :" echo "MODE: $MODE" echo "PURGE_FORCE: $PURGE_FORCE" echo "THREADS: $THREADS" @@ -23,22 +24,19 @@ echo "RUN_2: $RUN_2" echo "PREFIX: $PREFIX" -echo ${INPUT} -ls - # Run the appropriate hifiasm command based on the mode case "$MODE" in default) - echo "Running in default mode..." + echo "Asm4pg -> Running hifiasm in default mode..." hifiasm -l${PURGE_FORCE} -o ${PREFIX} -t ${THREADS} ${INPUT} ;; hi-c) - echo "Running in hi-c mode..." + echo "Asm4pg -> Running hifiasm in hi-c mode..." hifiasm -l${PURGE_FORCE} -o ${PREFIX} -t ${THREADS} --h1 ${RUN_1} --h2 ${RUN_2} ${INPUT} mv ${PREFIX}.hic.hap1.p_ctg.gfa ${PREFIX}.bp.hap1.p_ctg.gfa mv ${PREFIX}.hic.hap2.p_ctg.gfa ${PREFIX}.bp.hap2.p_ctg.gfa ;; *) - echo "Unknown mode: $MODE" + echo "Asm4pg -> Unknown hifiasm mode: $MODE" ;; esac -- GitLab
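With the messages above, a hi-c run of the dispatcher would be launched as below (a sketch; the sample name and read files are hypothetical). In default mode the two Hi-C arguments are still passed positionally, as the literal string None returned by get_run:
./workflow/scripts/hifiasm_call.sh hi-c 3 20 S1.fasta.gz S1_hic_R1.fastq.gz S1_hic_R2.fastq.gz results/S1_results/01_raw_assembly/S1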
From ecf3f75087e8cf0c0891108edb46c39615b7e99 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 16:45:32 +0100 Subject: [PATCH 061/178] Update the script so it can run when there are no purged haplotigs --- workflow/scripts/hist_plot.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/workflow/scripts/hist_plot.py b/workflow/scripts/hist_plot.py index 2b9db36..a1fd465 100755 --- a/workflow/scripts/hist_plot.py +++ b/workflow/scripts/hist_plot.py @@ -66,18 +66,28 @@ def mk_plot(hists, cutoffs, ttle, xm, xM, ym, yM, out_fl): if __name__ == "__main__": parser = argparse.ArgumentParser(description='read depth histogram plot') - parser.add_argument('-c', '--cutoffs', type=str, action="store", dest = "con", help ='read depth cutoffs') - parser.add_argument('-y', '--ymin', type=int, action="store", dest = "ymin", help ='set ymin') - parser.add_argument('-x', '--xmin', type=int, action = "store", dest = "xmin", help = 'set xmin') - parser.add_argument('-Y', '--ymax', type=int, action="store", dest = "ymax", help ='set ymax') - parser.add_argument('-X', '--xmax', type=int, action = "store", dest = "xmax", help = 'set xmax') - parser.add_argument('-t', '--title', type = str, action = "store", dest = "title", help = 'figure title [NULL]', default="") - parser.add_argument('-d', '--delim', type = str, action = "store", dest = "delim", help = 'delimiter', default="\t") + parser.add_argument('-c', '--cutoffs', type=str, action="store", dest="con", help='read depth cutoffs') + parser.add_argument('-y', '--ymin', type=int, action="store", dest="ymin", help='set ymin') + parser.add_argument('-x', '--xmin', type=int, action="store", dest="xmin", help='set xmin') + parser.add_argument('-Y', '--ymax', type=int, action="store", dest="ymax", help='set ymax') + parser.add_argument('-X', '--xmax', type=int, action="store", dest="xmax", help='set xmax') + parser.add_argument('-t', '--title', type=str, action="store", dest="title", help='figure title [NULL]', default="") + parser.add_argument('-d', '--delim', type=str, action="store", dest="delim", help='delimiter', default="\t") parser.add_argument('-v', '--version', action='version', version='hist_plot 0.0.0') - parser.add_argument('stat_fn', type=str, action="store", help = "stat file") - parser.add_argument('out_fn', type=str, action="store", help = "output file") + parser.add_argument('stat_fn', type=str, action="store", help="stat file") + parser.add_argument('out_fn', type=str, action="store", help="output file") + opts = parser.parse_args() - hists = col_hist(opts.stat_fn, opts.delim) - cutoffs = get_cutoffs(opts.con) - mk_plot(hists, cutoffs, opts.title, opts.xmin, opts.xmax, opts.ymin, opts.ymax, opts.out_fn) + + # Try to draw the graph, and exit with an empty image if it cannot be constructed + try: + hists = col_hist(opts.stat_fn, opts.delim) + cutoffs = get_cutoffs(opts.con) + mk_plot(hists, cutoffs, opts.title, opts.xmin, opts.xmax, opts.ymin, opts.ymax, opts.out_fn) + except Exception as e: + plt.figure(figsize=(8, 6)) + plt.title("No cutoffs found") + plt.axis('off') + plt.savefig(opts.out_fn, dpi=300) + -- GitLab
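For reference, hist_plot.py is invoked as in the cutoffs_eval rule deleted two patches above; with the fallback added here, a missing or unparsable cutoffs file now yields a placeholder PNG instead of a crash. A sketch with hypothetical paths:
python3 workflow/scripts/hist_plot.py -c cutoffs PB.stat cutoffs_graph_hap1.png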
From d9c73b7aea03df2a26e11e93a3c8aaf19fbbe8d8 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 7 Jan 2025 16:46:06 +0100
Subject: [PATCH 062/178] Move the purge boolean fetcher

---
 workflow/scripts/from_config/parameters.py | 16 ----------------
 workflow/scripts/parameter_retrieval.py    | 17 ++++++++++++++---
 2 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/workflow/scripts/from_config/parameters.py b/workflow/scripts/from_config/parameters.py
index 8745d4e..442ae05 100644
--- a/workflow/scripts/from_config/parameters.py
+++ b/workflow/scripts/from_config/parameters.py
@@ -40,20 +40,4 @@ def get_bam(wildcards):
     fq = config[f'{id_name}']["bam"]
     return(fq)
 
-# Fetch the purge mode, return a boolean from config file
-def get_purge(wildcards):
-    id_name = wildcards.id
-    try :
-        purge_bool = config[f'{id_name}']["run_purge_dups"]
-    except KeyError:
-        return False
-    return purge_bool
 
-# Fetch the purge level for hifiasm
-def get_purge_force(wildcards):
-    id_name = wildcards.id
-    try :
-        force = config[f'{id_name}']["assembly_purge_force"]
-    except KeyError:
-        return '3'
-    return force
\ No newline at end of file
diff --git a/workflow/scripts/parameter_retrieval.py b/workflow/scripts/parameter_retrieval.py
index 1b70a38..d3ffc47 100644
--- a/workflow/scripts/parameter_retrieval.py
+++ b/workflow/scripts/parameter_retrieval.py
@@ -6,21 +6,32 @@ def get_purge_force(wildcards):
     try :
         force = config["samples"][wildcards.sample]["assembly_purge_force"]
     except KeyError:
-        print('No "assembly_purge_force" specified, using l3 by default')
+        print('Asm4pg -> No "assembly_purge_force" specified, using l3 by default')
         return '3'
     return force
 
+# Fetch the mode for hifiasm
 def get_mode(wildcards):
     try :
         mode = config["samples"][wildcards.sample]["mode"]
     except KeyError:
-        print('No "mode" specified, using default')
+        print('Asm4pg -> No "mode" specified, using default assembly mode for hifiasm')
         return 'default'
     return mode
 
+# Fetch r1/r2 fasta file for hi-c
 def get_run(wildcards, run:int):
     try :
         run= config["samples"][wildcards.sample][f"r{run}"]
     except KeyError:
         return 'None'
-    return run
\ No newline at end of file
+    return run
+
+# Fetch the purge mode, return a boolean from config file
+def get_purge_bool(wildcards):
+    try :
+        purge_bool = config["samples"][wildcards.sample]["run_purge_dups"]
+    except KeyError:
+        print('Asm4pg -> "run_purge_dups" unspecified, using "False" by default')
+        return False
+    return purge_bool
--
GitLab

From a66ae547bcdc5cfda78fa410b18399f3f1170817 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 7 Jan 2025 16:46:59 +0100
Subject: [PATCH 063/178] Script to have a dynamic purge based on user choice

---
 workflow/scripts/haplotigs_handling.sh | 54 ++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100755 workflow/scripts/haplotigs_handling.sh

diff --git a/workflow/scripts/haplotigs_handling.sh b/workflow/scripts/haplotigs_handling.sh
new file mode 100755
index 0000000..cd1c2ba
--- /dev/null
+++ b/workflow/scripts/haplotigs_handling.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Script to dynamically handle haplotigs with the correct command based on the mode
+# Author: Lucien PIAT
+# For: Project Pangenoak
+# Date: January 6, 2025
+
+# Usage: ./haplotigs_handling.sh purge_dups_option hap_in hap_out prefix reads out_dir
+
+PURGE_DUPS=$1
+HAP_IN=$2
+HAP_OUT=$3
+PREFIX=$4
+READS=$5
+DIRR=$6
+
+if [[ "$PURGE_DUPS" == "True" || "$PURGE_DUPS" == "true" ]]; then
+
+    # Run purge_dups on this haplotype assembly
+
+    echo "Asm4pg -> Running purge_dups on haplotigs..."
+
+    # Compute coverage stats and cutoffs for purge_dups with pbcstat/calcuts
+    minimap2 -xasm20 $HAP_IN $READS | gzip -c - > $DIRR/$PREFIX.paf.gz
+    pbcstat $DIRR/$PREFIX.paf.gz -O $DIRR
+    calcuts $DIRR/PB.stat > $DIRR/cutoffs 2> $DIRR/calcuts.log
+
+    # Split assembly & self-self alignment
+    split_fa $HAP_IN > $DIRR/$PREFIX.split
+    minimap2 -xasm5 -DP $DIRR/$PREFIX.split $DIRR/$PREFIX.split | gzip -c - > $DIRR/$PREFIX.split.self.paf.gz
+
+    # Purge haplotigs & overlaps
+    purge_dups -2 -T $DIRR/cutoffs -c $DIRR/PB.base.cov $DIRR/$PREFIX.split.self.paf.gz > $DIRR/dups.bed 2> $DIRR/purge_dups.log
+
+    # Get purged primary and haplotig sequences from draft assembly
+    get_seqs -e $DIRR/dups.bed $HAP_IN -p $DIRR/$PREFIX
+
+    rm $DIRR/dups.bed
+    rm $DIRR/PB.base.cov
+    rm $DIRR/PB.cov
+    rm $DIRR/*paf.gz
+    rm $DIRR/*split*
+    gzip -c $DIRR/$PREFIX.purged.fa > $HAP_OUT && rm $DIRR/$PREFIX.purged.fa
+    rm $DIRR/$PREFIX.hap.fa
+
+else
+    # If purge_dups is false, create symbolic links to the output location
+
+    echo "Asm4pg -> Purge option is false. Leaving the assembly untouched"
+    cp $HAP_IN $HAP_OUT #TODO find why ln is not working here
+
+    # Add an empty cutoffs file so snakemake can link the rules
+    echo "No cutoffs, purge_dups is turned off" > $DIRR/cutoffs
+fi
+
--
GitLab

From 98ab4b597ad3f3991edb2bfaf2abfb83f653aa88 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 7 Jan 2025 16:48:05 +0100
Subject: [PATCH 064/178] Merge all rules that used purge_dups into one to
 dynamically handle all purging options

---
 workflow/Snakefile | 77 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 59 insertions(+), 18 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index a767815..902f140 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,3 +1,7 @@
+# Author: Lucien PIAT based on Sukanya Denni's work
+# For: Project Pangenoak
+# Date: January, 2025
+
 configfile: ".config/masterconfig.yaml"
 
 include: "scripts/parameter_retrieval.py"
@@ -10,18 +14,20 @@ output_dir = config.get("output_dir", "results/")
 rule all:
     input:
         expand(
-            os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap1.fa.gz"),
-            sample=config["samples"].keys()
+            os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","{sample}_final_hap{n}.fasta.gz"),
+            sample=config["samples"].keys(), n =[1, 2]
         ),
         expand(
-            os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap2.fa.gz"),
-            sample=config["samples"].keys()
+            os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","cutoffs_graph_hap{n}.png"),
+            sample=config["samples"].keys(), n =[1, 2]
         )
+
+
 # Genome assembly using hifiasm
 rule hifiasm:
     input:
-        fasta=lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"]
+        reads=lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"]
     output:
         hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap1.p_ctg.gfa"),
         hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap2.p_ctg.gfa")
@@ -39,24 +45,27 @@ rule hifiasm:
     container:
         f"{container_registry}/hifiasm:0.19.6"
-        ""
     shell:
-        "./workflow/scripts/hifiasm_call.sh {params.mode} {params.purge_force} {threads} {input.fasta} {params.run_1} {params.run_2} {params.prefix}"
+        """
+        ./workflow/scripts/hifiasm_call.sh {params.mode} {params.purge_force} {threads} {input.reads} {params.run_1} {params.run_2} {params.prefix}
+        """
 
 # Convert the gfa files of hifiasm to fasta
 TO_FA_CMD = r"""/^S/{print ">"$2;print $3}"""
 
 rule pigz_gfa_to_fasta:
     input:
         hap1_gfa = rules.hifiasm.output.hap1,
-        hap2_gfa = rules.hifiasm.output.hap2
+        hap2_gfa = rules.hifiasm.output.hap2,
+        reads = lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"]
     output:
-        hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}_hap1.fa.gz"),
-        hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}_hap2.fa.gz")
+        hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}_hap1.fasta.gz"),
+        hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}_hap2.fasta.gz")
     threads: 4
     resources:
        mem_mb=25000,
    container:
        f"{container_registry}/pigz"
    shell:
-        """awk {TO_FA_CMD:q} {input.hap1_gfa} | pigz -p {threads} > {output.hap1} &&"""
-        """awk {TO_FA_CMD:q} {input.hap2_gfa} | pigz -p {threads} > {output.hap2}"""
+        """
+        awk {TO_FA_CMD:q} {input.hap1_gfa} | pigz -p {threads} > {output.hap1} &&
+        awk {TO_FA_CMD:q} {input.hap2_gfa} | pigz -p {threads} > {output.hap2}
+        """
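Outside Snakemake, a minimal invocation of this helper could look as follows (sample names and paths are illustrative; the argument order is the one parsed at the top of the script):

    ./workflow/scripts/haplotigs_handling.sh True \
        results/run1_results/01_raw_assembly/run1_hap1.fasta.gz \
        results/run1_results/02_final_assembly/hap1/run1_final_hap1.fasta.gz \
        run1_hap1 \
        small_example.fasta.gz \
        results/run1_results/02_final_assembly/hap1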
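The TO_FA_CMD one-liner introduced above turns GFA S-lines into FASTA records; the equivalent manual conversion is sketched below (file names illustrative):

    # each segment: print its name as a FASTA header, then its sequence
    awk '/^S/{print ">"$2;print $3}' run1.bp.hap1.p_ctg.gfa | pigz -p 4 > run1_hap1.fasta.gz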
 
 # Potentially purge the haplotigs using purge_dups
-rule pruge_haplotigs:
+rule haplotigs_handling:
+    input:
+        hap_fasta = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}_hap{n}.fasta.gz"),
+        reads=lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"]
+    output:
+        hap = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","{sample}_final_hap{n}.fasta.gz"),
+        cutoffs = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","cutoffs")
+    params:
+        prefix = "{sample}_hap{n}",
+        dirr = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}"),
+        purge_dups_option = get_purge_bool
+    threads: 20
+    resources:
+        mem_mb=100000,
+        time="10:00:00"
+    container:
+        f"{container_registry}/purge_dups1.2.5"
+    shell:
+        """
+        ./workflow/scripts/haplotigs_handling.sh {params.purge_dups_option} {input.hap_fasta} {output.hap} {params.prefix} {input.reads} {params.dirr}
+        """
+
+
+# Make purge_dups cutoffs graph
+rule cutoffs_graph:
     input:
-        hap1_fasta = rules.pigz_gfa_to_fasta.output.hap1,
-        hap2_fasta = rules.pigz_gfa_to_fasta.output.hap2
+        rules.haplotigs_handling.output.cutoffs
     output:
-        hap1 = os.path.join(output_dir, "{sample}_results", "02_final_asembly","{sample}_final_hap1.fasta.gz"),
-        hap2 = os.path.join(output_dir, "{sample}_results", "02_final_asembly","{sample}_final_hap2.fasta.gz")
-    params:
\ No newline at end of file
+        graph = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","cutoffs_graph_hap{n}.png")
+    params:
+        dirr = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}")
+    threads: 1
+    resources:
+        mem_mb=10000,
+        time="01:00:00"
+    container:
+        f"{container_registry}/matplotlib0.11.5"
+    shell:
+        "python3 workflow/scripts/hist_plot.py -c {input} {params.dirr}/PB.stat {output.graph}"
\ No newline at end of file
--
GitLab

From afff0335e1bf4335f8950bed7288425b96187d1a Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 7 Jan 2025 17:02:42 +0100
Subject: [PATCH 065/178] Add genometools rule

---
 workflow/Snakefile | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 902f140..056f042 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -20,10 +20,12 @@ rule all:
         expand(
             os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","cutoffs_graph_hap{n}.png"),
             sample=config["samples"].keys(), n =[1, 2]
+        ),
+        expand(
+            os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "{sample}_genometools_stats.txt"),
+            sample=config["samples"].keys()
         )
-
-
 # Genome assembly using hifiasm
 rule hifiasm:
     input:
@@ -95,7 +97,6 @@ rule haplotigs_handling:
         ./workflow/scripts/haplotigs_handling.sh {params.purge_dups_option} {input.hap_fasta} {output.hap} {params.prefix} {input.reads} {params.dirr}
         """
 
-
 # Make purge_dups cutoffs graph
 rule cutoffs_graph:
@@ -111,4 +112,19 @@ rule cutoffs_graph:
     container:
         f"{container_registry}/matplotlib0.11.5"
     shell:
-        "python3 workflow/scripts/hist_plot.py -c {input} {params.dirr}/PB.stat {output.graph}"
\ No newline at end of file
+        "python3 workflow/scripts/hist_plot.py -c {input} {params.dirr}/PB.stat {output.graph}"
+
+rule genometools_on_raw_data:
+    input:
+        reads=lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"]
+    output:
+        os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "{sample}_genometools_stats.txt")
+    priority: 1
+    resources:
+        mem_mb=100000,
+        time="10:00:00"
+    threads: 4
+    container:
+        f"{container_registry}/genometools1.5.9"
+    shell:
+        "gt seqstat {input.reads} > {output}"
--
GitLab
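The genometools call wrapped by the new rule is a one-liner; run by hand it amounts to (input name illustrative):

    # sequence count, total length and composition stats for the gzipped reads
    gt seqstat small_example.fasta.gz > run1_genometools_stats.txt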
"{sample}_genometools_stats.txt") + priority: 1 + resources: + mem_mb=100000, + time="10:00:00" + threads: 4 + container: + f"{container_registry}/genometools1.5.9" + shell: + "gt seqstat {input.reads} > {output}" -- GitLab From 643242eedd49de6829d226a19fbc9faaae19746b Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 7 Jan 2025 17:05:19 +0100 Subject: [PATCH 066/178] update default container registry --- workflow/Snakefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 056f042..c6a8f02 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -8,7 +8,7 @@ include: "scripts/parameter_retrieval.py" import os import yaml -container_registry = config.get("container_registry", "docker://registry.forgemia.inra.fr/pangepop/mspangepop") +container_registry = config.get("container_registry", "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg") output_dir = config.get("output_dir", "results/") rule all: @@ -44,7 +44,7 @@ rule hifiasm: threads: 20 resources: mem_mb=250000, - time="10:00:00" + time="80:00:00" container: f"{container_registry}/hifiasm:0.19.6" shell: @@ -66,6 +66,7 @@ rule pigz_gfa_to_fasta: threads: 4 resources: mem_mb=25000, + time="10:00:00" container: f"{container_registry}/pigz" shell: @@ -89,7 +90,7 @@ rule haplotigs_handling: threads: 20 resources: mem_mb=100000, - time="10:00:00" + time="80:00:00" container: f"{container_registry}/purge_dups1.2.5" shell: -- GitLab From 15db222b4101a2b96c6e0e7e73f57e9f5cb3162a Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 11:25:35 +0100 Subject: [PATCH 067/178] ingore the snakemake produced files --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 14ed260..d831bde 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,5 @@ node_modules node_modules/* !slurm_logs/* -!workflow/* \ No newline at end of file +!workflow/* +.snakemake \ No newline at end of file -- GitLab From d024fd1bf830ea50cdba503b6e481724b356df87 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 14:12:23 +0100 Subject: [PATCH 068/178] update to force rule all execution, improve robustness --- local_run.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/local_run.sh b/local_run.sh index 94d7544..f09cadf 100755 --- a/local_run.sh +++ b/local_run.sh @@ -12,10 +12,10 @@ run_snakemake() { case "$option" in dry) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES -n + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES -n -R all ;; dag) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES --dag > dag.dot + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES -R all --dag > dag.dot if [ $? 
From 7372a04881ec8ae418354f8b040e12eec74552ba Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 8 Jan 2025 14:12:58 +0100
Subject: [PATCH 069/178] add a function to retrieve busco lineage from config

---
 workflow/scripts/parameter_retrieval.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/workflow/scripts/parameter_retrieval.py b/workflow/scripts/parameter_retrieval.py
index d3ffc47..ca433c5 100644
--- a/workflow/scripts/parameter_retrieval.py
+++ b/workflow/scripts/parameter_retrieval.py
@@ -35,3 +35,11 @@ def get_purge_bool(wildcards):
         print('Asm4pg -> "run_purge_dups" unspecified, using "False" by default')
         return False
     return purge_bool
+
+def get_busco_lin(wildcards) -> str:
+    try :
+        lin = config["samples"][wildcards.sample]["busco_lineage"]
+    except KeyError:
+        print('Asm4pg -> "busco_lineage" unspecified for ' + wildcards.sample + ', using "eukaryota_odb10" by default')
+        return "eukaryota_odb10"
+    return lin
\ No newline at end of file
--
GitLab

From 845428d25cf3cff38ae9498629d384b0f6fe3cd9 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 8 Jan 2025 14:13:26 +0100
Subject: [PATCH 070/178] delete old config

---
 .snakemake/masterconfig.yaml | 34 ----------------------------------
 1 file changed, 34 deletions(-)
 delete mode 100644 .snakemake/masterconfig.yaml

diff --git a/.snakemake/masterconfig.yaml b/.snakemake/masterconfig.yaml
deleted file mode 100644
index 335c7da..0000000
--- a/.snakemake/masterconfig.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-# Config file
-
-samples:
-    test_sample1:
-        fasta_gz: small_example.fasta.gz
-        assembly_purge_force: 2
-        busco_lineage: eudicots_odb10
-
-reference_genome : "my_reference.fasta.gz"
-run_quast: True
-run_ragtag: True
-
-container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg"
-output_dir: "results/"
-
-
-### JOB
-# QC
-qcdir: 01_raw_data_QC
-fqc: 01_fastQC
-lqc: 02_longQC
-gentools: 03_genometools
-kmer: 04_kmer
-
-# assembly
-assembdir: 02_genome_assembly
-asm_raw: 01_raw_assembly
-asm_purged: 02_after_purge_dups_assembly
-asm_conta: 03_uncontaminated_assembly
-asm: 00_assembly
-asm_qc: 01_assembly_QC
-
-# number of threads used by pigz
-pigz_threads: 4
\ No newline at end of file
--
GitLab

From 870bf5e32de40541c111965b581c304c6f38f7d4 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 8 Jan 2025 14:13:44 +0100
Subject: [PATCH 071/178] Simplify config, remove fuss

---
 .config/masterconfig.yaml | 32 +-------------------------------
 1 file changed, 1 insertion(+), 31 deletions(-)

diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml
index 04c82e6..29bdd03 100644
--- a/.config/masterconfig.yaml
+++ b/.config/masterconfig.yaml
@@ -1,39 +1,9 @@
 # Config file
 
 samples:
-    test_default:
+    run1:
         fasta_gz: small_example.fasta.gz
-        busco_lineage: eudicots_odb10
-    test_hi-c:
-        fasta_gz: small_example.fasta.gz
-        mode: hi-c
-        r1: small_example.fasta.gz
-        r2: small_example.fasta.gz
-        assembly_purge_force: 2
-        busco_lineage: eudicots_odb10
-
-reference_genome : "my_reference.fasta.gz"
-run_quast: True
-run_ragtag: True
 
 container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg"
 output_dir: "results/"
-
-
-### JOB
-# QC
-qcdir: 01_raw_data_QC
-fqc: 01_fastQC
-lqc: 02_longQC
-gentools: 03_genometools
-kmer: 04_kmer
-
-# assembly
-assembdir: 02_genome_assembly
-asm_raw: 01_raw_assembly
-asm_purged: 02_after_purge_dups_assembly
-asm_conta: 03_uncontaminated_assembly
-asm: 00_assembly
-asm_qc: 01_assembly_QC
--
GitLab
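With the fetchers from the previous commits, every per-sample key except fasta_gz is optional; a fuller hypothetical config (values invented for illustration) could be written out like so:

    cat > .config/masterconfig.yaml <<'EOF'
    samples:
        run1:
            fasta_gz: small_example.fasta.gz
            mode: hi-c                     # defaults to "default" when omitted
            r1: hic_R1.fasta.gz            # hi-c mode only
            r2: hic_R2.fasta.gz            # hi-c mode only
            assembly_purge_force: 2        # hifiasm purge level, defaults to 3
            run_purge_dups: True           # defaults to False
            busco_lineage: eudicots_odb10  # defaults to eukaryota_odb10
    container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg"
    output_dir: "results/"
    EOF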
"docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg" output_dir: "results/" - - -### JOB -# QC -qcdir: 01_raw_data_QC -fqc: 01_fastQC -lqc: 02_longQC -gentools: 03_genometools -kmer: 04_kmer - -# assembly -assembdir: 02_genome_assembly -asm_raw: 01_raw_assembly -asm_purged: 02_after_purge_dups_assembly -asm_conta: 03_uncontaminated_assembly -asm: 00_assembly -asm_qc: 01_assembly_QC - -- GitLab From 3b9ee3a6ab23edfa8cc3df3c8b431f84b6f5f746 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 14:36:17 +0100 Subject: [PATCH 072/178] Add function to retrieve ploidy from configfile --- workflow/scripts/parameter_retrieval.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/parameter_retrieval.py b/workflow/scripts/parameter_retrieval.py index ca433c5..99fd87b 100644 --- a/workflow/scripts/parameter_retrieval.py +++ b/workflow/scripts/parameter_retrieval.py @@ -42,4 +42,12 @@ def get_busco_lin(wildcards) -> str: except KeyError: print('Asm4pg -> "busco_lineage" unspecified for ' + config["samples"][wildcards.sample]+ ', using "eukaryota_odb10" by default') return "eukaryota_odb10" - return lin \ No newline at end of file + return lin + +def get_ploidy(wildcards) -> int: + try : + ploidy = config["samples"][wildcards.sample]["ploidy"] + except KeyError: + print('Asm4pg -> "ploidy" unspecified for ' + config["samples"][wildcards.sample]+ ', using 2 by default') + return 2 + return ploidy \ No newline at end of file -- GitLab From 15fc5628f887284144f81ed3c29445bfc63ae848 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 15:12:55 +0100 Subject: [PATCH 073/178] remove obsolete file --- workflow/rules/05_purged_asm_qc.smk | 60 ----------------------------- 1 file changed, 60 deletions(-) delete mode 100644 workflow/rules/05_purged_asm_qc.smk diff --git a/workflow/rules/05_purged_asm_qc.smk b/workflow/rules/05_purged_asm_qc.smk deleted file mode 100644 index 9ce446f..0000000 --- a/workflow/rules/05_purged_asm_qc.smk +++ /dev/null @@ -1,60 +0,0 @@ -### stats on purged haplotypes -# only the input, output and params have been modified, -# commands are the same as the ones in corresponding rule (dir : workflow/rules) - -# reuse busco rule from 03_asm_qc.smk -use rule busco as purge_busco with: - input: - rules.purge_dups.output.purge - output: - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/busco/{id}_purged_hap{n}/short_summary.specific.{lin}.{id}_purged_hap{n}.txt", - params: - prefix=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/busco", - lineage=get_busco_lin, # get lineage from config - sample="{id}_purged_hap{n}" - benchmark: - res_path + "/{runid}/benchmark/{id}_hap{n}_{lin}_busco_purged.txt" - -# reuse genometools rule from 03_asm_qc.smk -use rule genometools_on_raw_data as purge_genometools with: - input: - rules.purge_dups.output.purge - output: - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/assembly_stats/{id}_purged_hap{n}.AStats.txt" - -# reuse kat rule from 03_asm_qc.smk -use rule kat as purge_kat with: - input: - hap = rules.purge_dups.output.purge, - jellyfish = res_path + "/{runid}/01_raw_data_QC/04_kmer/{id}.jf" - output: - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/katplot/hap{n}/{id}_purged_hap{n}.katplot.png" - params: - prefix="{id}_hap{n}", - path= res_path + 
"/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/katplot/hap{n}/{id}_purged_hap{n}" - -rule purge_find_telomeres: - input: - rules.purge_dups.output.purge - output: - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/telomeres/{id}_hap{n}_purged_telomeres.txt" - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/biopython1.75" - shell: - "python3 workflow/scripts/FindTelomeres.py {input} > {output}" - -use rule LTR_finder as purge_LTR_finder with : - input: - rules.purge_dups.output.purge - output: - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/LAI/purge_{id}_hap{n}.scn" - -use rule LTR_retriever as purge_LTR_retriever with : - input: - scn=rules.purge_LTR_finder.output, - genome=rules.purge_dups.output.purge - output: - lai=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/LAI/purge_{id}_hap{n}.out.LAI", - recap=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/LAI/purge_recap_{id}_hap{n}.tbl" - params: - prefix="{id}_hap{n}.purged" \ No newline at end of file -- GitLab From 05d18efc1d0e598678c75ff8b8e2f5d0d7d721f2 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 15:13:25 +0100 Subject: [PATCH 074/178] Remove duplicate fetch function --- workflow/scripts/from_config/target_list.py | 32 --------------------- 1 file changed, 32 deletions(-) diff --git a/workflow/scripts/from_config/target_list.py b/workflow/scripts/from_config/target_list.py index ec87c79..1d37a96 100644 --- a/workflow/scripts/from_config/target_list.py +++ b/workflow/scripts/from_config/target_list.py @@ -41,36 +41,4 @@ def for_report(id_list, trio =False): NAME.append(i) return(NAME) -#### BUSCO LINEAGE -def busco_lin(id_list): - lineage_list = [] - for i in id_list: - lineage = config[i]["busco_lineage"] - lineage_list.append(lineage) - return(lineage_list) - -########### CHECK IF BAM AND FASTQ ARE AVAILABLE ########### -#### BAM -def check_bam(id_list): - IDS = [] - for i in id_list: - if "bam" in config[i]: - IDS.append(i) - return(IDS) - -#### FASTQ -def check_fastq(id_list): - IDS = [] - for i in id_list: - if "fastq" in config[i]: - IDS.append(i) - return(IDS) -# QUAST -def check_quast(res_path): - """ - Check if run_quast is set to true in the masterconfig - """ - if config["run_quast"]: - return res_path + "/global_quast_report/report.html" - return [] \ No newline at end of file -- GitLab From 71996baf951ac9ab72e02d35fb7bb7348c3c3562 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 15:16:12 +0100 Subject: [PATCH 075/178] Add dynamic genometools on assembly rule --- workflow/Snakefile | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index c6a8f02..b02b34b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -11,25 +11,31 @@ import yaml container_registry = config.get("container_registry", "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg") output_dir = config.get("output_dir", "results/") + rule all: input: + # Required final assemblies and graphs + expand( + os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "{sample}_final_hap{n}.fasta.gz"), + sample=config["samples"].keys(), n=[1, 2] + ), expand( - os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","{sample}_final_hap{n}.fasta.gz"), - 
sample=config["samples"].keys(), n =[1, 2] + os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "cutoffs_graph_hap{n}.png"), + sample=config["samples"].keys(), n=[1, 2] ), expand( - os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","cutoffs_graph_hap{n}.png"), - sample=config["samples"].keys(), n =[1, 2] + os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "{sample}_hap{n}_genometools_stats.txt"), + sample=config["samples"].keys(), n=[1, 2] ), expand( - os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "{sample}_genometools_stats.txt"), - sample=config["samples"].keys() + os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco", "short_summary.specific.{sample}_hap{n}.txt"), + sample=config["samples"].keys(), n=[1, 2] ) - + # Genome assembly using hifiasm rule hifiasm: input: - reads=lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] + reads = lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] output: hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap1.p_ctg.gfa"), hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap2.p_ctg.gfa") @@ -79,7 +85,7 @@ rule pigz_gfa_to_fasta: rule haplotigs_handling: input: hap_fasta = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}_hap{n}.fasta.gz"), - reads=lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] + reads = lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] output: hap = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","{sample}_final_hap{n}.fasta.gz"), cutoffs = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","cutoffs") @@ -115,9 +121,10 @@ rule cutoffs_graph: shell: "python3 workflow/scripts/hist_plot.py -c {input} {params.dirr}/PB.stat {output.graph}" +# Produce basic stats for the reads files rule genometools_on_raw_data: input: - reads=lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] + lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] output: os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "{sample}_genometools_stats.txt") priority: 1 @@ -128,4 +135,14 @@ rule genometools_on_raw_data: container: f"{container_registry}/genometools1.5.9" shell: - "gt seqstat {input.reads} > {output}" + "gt seqstat {input} > {output}" + +# NOT TESTED +# Produce basic stats for each haplotypes assembled +use rule genometools_on_raw_data as genometools_on_assembly with: + input: + rules.haplotigs_handling.output.hap + output: + os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "{sample}_hap{n}_genometools_stats.txt") + priority: 0 + -- GitLab From 43a3ff9f273dcee2b133bafe4b1cd5148f79872d Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 15:17:10 +0100 Subject: [PATCH 076/178] Add unpigz rule for scripts that needs uncompresed data --- workflow/Snakefile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/workflow/Snakefile b/workflow/Snakefile index b02b34b..42ae877 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -146,3 +146,19 @@ use rule genometools_on_raw_data as genometools_on_assembly with: os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "{sample}_hap{n}_genometools_stats.txt") priority: 0 +# NOT TESTED +# Rule to unsip the haplotypes if needed for other rules +rule unpigz_to_fasta: + input: + 
From 4ae2357af1da4643a069167c9c92c2f41c392b15 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 8 Jan 2025 15:17:53 +0100
Subject: [PATCH 077/178] Add a dynamic rule for BUSCO

---
 workflow/Snakefile | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 42ae877..fa27a57 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -162,3 +162,23 @@ rule unpigz_to_fasta:
     shell:
         "unpigz -k -p 1 {input}"
 
+# NOT TESTED
+# BUSCO stats on assembly (may not work on first run, rerun the WF)
+rule busco:
+    input:
+        rules.unpigz_to_fasta.output
+    output:
+        os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco", "short_summary.specific.{sample}_hap{n}.txt")
+    params:
+        prefix = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco"),
+        lineage=get_busco_lin,
+        sample="{sample}_hap{n}"
+    threads: 20
+    resources:
+        mem_mb=100000,
+        time="10:00:00"
+    container:
+        f"{container_registry}/busco:5.7.1"
+    shell:
+        "busco -f -i {input} -l {params.lineage} --out_path {params.prefix} -o {params.sample} -m genome -c {threads}"
+
--
GitLab
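For orientation, the wrapped BUSCO call is equivalent to the following manual run (paths and lineage illustrative):

    busco -f -i run1_final_hap1.fasta -l eudicots_odb10 \
        --out_path results/run1_results/04_assembly_qc/hap1/busco \
        -o run1_hap1 -m genome -c 20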
From c31ca0483646c26fc614ca4c740528b94035c228 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 8 Jan 2025 15:18:34 +0100
Subject: [PATCH 078/178] Add dynamic rule to find telomeres

---
 workflow/Snakefile | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index fa27a57..3cc4e5b 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -182,3 +182,19 @@ rule busco:
     shell:
         "busco -f -i {input} -l {params.lineage} --out_path {params.prefix} -o {params.sample} -m genome -c {threads}"
 
+# NOT TESTED
+# Estimate telomeric region content
+rule find_telomeres:
+    input:
+        rules.unpigz_to_fasta.output
+    output:
+        os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "telomeres", "{sample}_hap{n}_telomeres.txt")
+    threads: 4
+    resources:
+        mem_mb=40000
+        time="10:00:00"
+    container:
+        f"{container_registry}/biopython1.75"
+    shell:
+        "python3 workflow/scripts/FindTelomeres.py {input} > {output}"
+
--
GitLab

From d5f9e100d73c52477596c8c5b1159fce3f8401d3 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 8 Jan 2025 15:19:43 +0100
Subject: [PATCH 079/178] Add a ruleset to analyse jellyfish data

---
 workflow/Snakefile | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 3cc4e5b..44a8e8b 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -198,3 +198,59 @@ rule find_telomeres:
     shell:
         "python3 workflow/scripts/FindTelomeres.py {input} > {output}"
 
+# NOT TESTED
+# Count k-mers in the input reads and generate a histogram of k-mer frequencies.
+rule jellyfish:
+    input:
+        reads = lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"]
+    output:
+        jf = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "jellyfish", "{sample}.jf"),
+        histo = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "jellyfish", "{sample}.histo")
+    threads: 10
+    resources:
+        mem_mb=8000
+        time="10:00:00"
+    container:
+        f"{container_registry}/jellyfish2.3.0"
+    shell:
+        "jellyfish count -m 21 -s 100M -t 10 -o {output.jf} -C <(zcat {input.reads}) && "
+        "jellyfish histo -h 1000000 -t 10 {output.jf} > {output.histo}"
+
+# NOT TESTED
+# Analyze the k-mer histogram and estimate genome characteristics, including genome size, heterozygosity, and error rates.
+rule genomescope:
+    input:
+        rules.jellyfish.output.histo
+    output:
+        plot = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png"),
+    params:
+        ploidy = get_ploidy,
+        dirr = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope"),
+    threads: 1
+    resources:
+        mem_mb=40000
+        time="10:00:00"
+    container:
+        f"{container_registry}/genomescope2.0"
+    shell:
+        "genomescope.R -k 21 -i {input} -o {params.dirr} -p {params.ploidy}"
+
+# NOT TESTED
+# Compare the k-mer content between the assembly and the raw reads.
+rule kat:
+    input:
+        hap = rules.haplotigs_handling.output.hap,
+        jellyfish = rules.jellyfish.output.jf
+    output:
+        os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}.katplot.png")
+    params:
+        dirr = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}")
+    threads: 4
+    resources:
+        mem_mb=40000
+        time="10:00:00"
+    container:
+        f"{container_registry}/kat2.4.1"
+    shell:
+        "kat comp -o {params.dirr} -t {threads} -m 21 --output_type png -v {input.jellyfish} {input.hap} && "
+        "kat plot spectra-cn -x 200 -o {params.dirr}.katplot.png {params.dirr}-main.mx"
--
GitLab
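Chained by hand, the two k-mer steps above come down to (file names illustrative; -C counts canonical k-mers):

    jellyfish count -m 21 -s 100M -t 10 -o run1.jf -C <(zcat small_example.fasta.gz)
    jellyfish histo -h 1000000 -t 10 run1.jf > run1.histo
    genomescope.R -k 21 -i run1.histo -o genomescope_out -p 2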
From 3513139b4685510b7966c40cb9e0c28fde37ab25 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 8 Jan 2025 15:34:20 +0100
Subject: [PATCH 080/178] Add a function to retrieve km_size for analysis

---
 workflow/scripts/parameter_retrieval.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/parameter_retrieval.py b/workflow/scripts/parameter_retrieval.py
index 99fd87b..5a0bde8 100644
--- a/workflow/scripts/parameter_retrieval.py
+++ b/workflow/scripts/parameter_retrieval.py
@@ -50,4 +50,12 @@ def get_ploidy(wildcards) -> int:
     except KeyError:
         print('Asm4pg -> "ploidy" unspecified for ' + wildcards.sample + ', using 2 by default')
         return 2
-    return ploidy
\ No newline at end of file
+    return ploidy
+
+def get_kmer_size(wildcards) -> int:
+    try :
+        size = config["samples"][wildcards.sample]["kmer_size"]
+    except KeyError:
+        print('Asm4pg -> "kmer_size" unspecified for ' + wildcards.sample + ', using 21 by default')
+        return 21
+    return size
\ No newline at end of file
--
GitLab

From 42bbd9b3df7155f4883d0568141f0400b14c0fbf Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 8 Jan 2025 15:34:48 +0100
Subject: [PATCH 081/178] modify the rules so that user can choose the size of
 the kmers

---
 workflow/Snakefile | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 44a8e8b..ec1c144 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -206,6 +206,8 @@ rule jellyfish:
     output:
         jf = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "jellyfish", "{sample}.jf"),
         histo = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "jellyfish", "{sample}.histo")
+    params:
+        km_size = get_kmer_size
     threads: 10
     resources:
         mem_mb=8000
@@ -213,8 +215,10 @@ rule jellyfish:
     container:
         f"{container_registry}/jellyfish2.3.0"
     shell:
-        "jellyfish count -m 21 -s 100M -t 10 -o {output.jf} -C <(zcat {input.reads}) && "
-        "jellyfish histo -h 1000000 -t 10 {output.jf} > {output.histo}"
+        """
+        jellyfish count -m {params.km_size} -s 100M -t {threads} -o {output.jf} -C <(zcat {input.reads}) &&
+        jellyfish histo -h 1000000 -t {threads} {output.jf} > {output.histo}
+        """
 
 # NOT TESTED
 # Analyze the k-mer histogram and estimate genome characteristics, including genome size, heterozygosity, and error rates.
@@ -222,9 +226,10 @@ rule genomescope:
     input:
         rules.jellyfish.output.histo
     output:
-        plot = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png"),
+        plot = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png")
     params:
         ploidy = get_ploidy,
+        km_size = get_kmer_size,
         dirr = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope"),
     threads: 1
     resources:
@@ -233,7 +238,7 @@ rule genomescope:
     container:
         f"{container_registry}/genomescope2.0"
     shell:
-        "genomescope.R -k 21 -i {input} -o {params.dirr} -p {params.ploidy}"
+        "genomescope.R -k {params.km_size} -i {input} -o {params.dirr} -p {params.ploidy}"
 
 # NOT TESTED
 # Compare the k-mer content between the assembly and the raw reads.
@@ -244,7 +249,8 @@ rule kat:
     output:
         os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}.katplot.png")
     params:
-        dirr = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}")
+        dirr = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}"),
+        km_size = get_kmer_size
     threads: 4
     resources:
@@ -252,5 +258,11 @@ rule kat:
     container:
         f"{container_registry}/kat2.4.1"
     shell:
-        "kat comp -o {params.dirr} -t {threads} -m 21 --output_type png -v {input.jellyfish} {input.hap} && "
-        "kat plot spectra-cn -x 200 -o {params.dirr}.katplot.png {params.dirr}-main.mx"
+        """
+        kat comp -o {params.dirr} -t {threads} -m {params.km_size} --output_type png -v {input.jellyfish} {input.hap} &&
+        kat plot spectra-cn -x 200 -o {params.dirr}.katplot.png {params.dirr}-main.mx
+        """
+
+# Creates a k-mer database from the input reads
+
+# Calculate assembly quality metrics
\ No newline at end of file
--
GitLab
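With the k-mer size now configurable, the underlying KAT calls are equivalent to (names illustrative; the .jf hash comes from the jellyfish rule):

    kat comp -o run1_hap1 -t 4 -m 21 --output_type png -v run1.jf run1_final_hap1.fasta.gz
    kat plot spectra-cn -x 200 -o run1_hap1.katplot.png run1_hap1-main.mx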
time="10:00:00" container: f"{container_registry}/jellyfish2.3.0" @@ -233,7 +233,7 @@ rule genomescope: dirr = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope"), threads: 1 resources: - mem_mb=40000 + mem_mb=40000, time="10:00:00" container: f"{container_registry}/genomescope2.0" @@ -253,7 +253,7 @@ rule kat: km_size = get_kmer_size threads: 4 resources: - mem_mb=40000 + mem_mb=40000, time="10:00:00" container: f"{container_registry}/kat2.4.1" @@ -263,6 +263,23 @@ rule kat: kat plot spectra-cn -x 200 -o {params.dirr}.katplot.png {params.path}-main.mx """ +# NOT TESTED # Creates a k-mer database from the input reads +rule meryl: + input: + lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] + output: + os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "merqury", "{sample}_reads-db.meryl") + params: + km_size = get_kmer_size + threads: 20 + resources: + mem_mb=60000, + time="10:00:00" + container: + f"{container_registry}/merqury1.3" + shell: + "meryl k={params.km_size} threads={threads} count {input} output {output}" + -# Calculate assembly quality metrics \ No newline at end of file +# Calculate assembly quality metrics -- GitLab From 97212f7c74458bab9dd3932167820d6a1ac2ca1a Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 16:44:43 +0100 Subject: [PATCH 083/178] Add a dynamic rule for merqury --- workflow/Snakefile | 61 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index f2d6d52..c75c544 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -281,5 +281,64 @@ rule meryl: shell: "meryl k={params.km_size} threads={threads} count {input} output {output}" +rule merqury: + input: + km_database = rules.meryl.output, + hap1 = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap1","{sample}_final_hap1.fasta.gz"), + hap2 = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap2","{sample}_final_hap2.fasta.gz"), + output: + qv = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.qv"), + stats = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.completeness.stats") + params: + prefix = "{sample}_merqury", + dirr = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury") + threads: 20 + resources: + mem_mb=60000, + time="10:00:00" + container: + f"{container_registry}/merqury1.3" + shell: + """ + # Create temporary copies of haplotypes + cp {input.hap1} {params.dirr}/tmp_hap1.fasta.gz + cp {input.hap2} {params.dirr}/tmp_hap2.fasta.gz + + # Run Merqury + cd {params.dirr} && \ + export MERQURY=/usr/local/share/merqury && \ + merqury.sh {input.read_db} {params.dirr}/tmp_hap1.fasta.gz {params.dirr}/tmp_hap1.fasta.gz {params.prefix} + """ + +rule LTR_finder: + input: + rules.unzip_hap_fasta.output + output: + res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/{id}_hap{n}.scn" + singularity: + "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/ltr_finder:latest" + shell: + "ltr_finder -C {input} > {output}" + +rule LTR_retriever: + input: + scn=rules.LTR_finder.output, + genome=rules.unzip_hap_fasta.output + output: + lai=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/{id}_hap{n}.out.LAI", + recap=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/recap_{id}_hap{n}.tbl" + params: + prefix="{id}_hap{n}" 
+ threads: 10 + singularity: + "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/ltr_retriever:3.0.1" + shell: + 'export PATH="/opt/LTR_retriever:$PATH" && ' + 'LTR_retriever -threads {threads} -genome {input.genome} -infinder {input.scn} && ' + 'mv {params.prefix}.fa.out.LAI {output.lai} && ' + 'mv {params.prefix}.fa.tbl {output.recap} && ' + 'rm {params.prefix}.fa?* && ' + 'rm -rf .RepeatMaskerCache &&' + 'rm {params.prefix}.fa' -# Calculate assembly quality metrics + \ No newline at end of file -- GitLab From 6d994e065ca84a7ecf59f4e07500a43e500d025a Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 16:45:07 +0100 Subject: [PATCH 084/178] regroup remaining rules to implement --- workflow/rules/01_qc.smk | 82 -------------- workflow/rules/03_asm_qc.smk | 101 ------------------ workflow/rules/05.5_purged_asm_qc_merqury.smk | 96 ----------------- workflow/rules/06_sym_link_hap.smk | 10 -- ...5_asm_qc_merqury.smk => unimplemented.smk} | 59 ++++------ 5 files changed, 19 insertions(+), 329 deletions(-) delete mode 100644 workflow/rules/01_qc.smk delete mode 100644 workflow/rules/03_asm_qc.smk delete mode 100644 workflow/rules/05.5_purged_asm_qc_merqury.smk delete mode 100644 workflow/rules/06_sym_link_hap.smk rename workflow/rules/{03.5_asm_qc_merqury.smk => unimplemented.smk} (67%) diff --git a/workflow/rules/01_qc.smk b/workflow/rules/01_qc.smk deleted file mode 100644 index 4f7e0b2..0000000 --- a/workflow/rules/01_qc.smk +++ /dev/null @@ -1,82 +0,0 @@ -### QC on .bam files with LongQC -rule longqc: - input: - abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{Bid}.bam" - output: - directory(res_path + "/{Bid}/{run}/01_raw_data_QC/02_longQC") - benchmark: - res_path + "/{Bid}/{run}/benchmark/longqc.txt" - priority: 1 - threads: 8 - resources: - mem_mb=60000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/longqc1.2.0c" - shell: - "longQC sampleqc -x pb-hifi -o {output} {input}" - -### QC on .fastq.gz files with FastQC -rule fastqc: - input: - get_fastq - output: - multiext(res_path + "/{Fid}/{run}/01_raw_data_QC/01_fastQC/{Fid}_fastqc", ".html", ".zip") - params: - output_path=res_path + "/{Fid}/{run}//01_raw_data_QC/01_fastQC/" - benchmark: - res_path + "/{Fid}/{run}/benchmark/fastqc.txt" - priority: 1 - threads: 4 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/fastqc:0.12.1" - shell: - "fastqc -o {params.output_path} {input}" - -### read stats - -rule genometools_on_raw_data: - input: - get_fasta - output: - res_path + "/{runid}/01_raw_data_QC/03_genometools/{id}.RawStat.txt" - priority: 1 - threads: 4 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/genometools1.5.9" - shell: - "gt seqstat {input} > {output}" - -### kmer stats -### kmer stats : jellyfish .histo used by genomescope -### assembly stats : jellyfish .jf used by KAT - -rule jellyfish: - input: - get_fasta - output: - jf = res_path + "/{runid}/01_raw_data_QC/04_kmer/{id}.jf", - histo = res_path + "/{runid}/01_raw_data_QC/04_kmer/{id}.histo" - priority: 1 - threads: 4 - resources: - mem_mb=40000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/jellyfish2.3.0" - shell: - "jellyfish count -m 21 -s 100M -t 10 -o {output[0]} -C <(zcat {input}) && " - "jellyfish histo -h 1000000 -t 10 {output[0]} > {output[1]}" - -rule genomescope: - input: - rules.jellyfish.output.histo - output: - d = directory(res_path + "/{runid}/01_raw_data_QC/04_kmer/{id}_genomescope"), - png = res_path + 
"/{runid}/01_raw_data_QC/04_kmer/{id}_genomescope/linear_plot.png" - params: - ploidy = get_ploidy - priority: 1 - threads: 4 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/genomescope2.0" - shell: - "genomescope.R -k 21 -i {input} -o {output.d} -p {params.ploidy}" \ No newline at end of file diff --git a/workflow/rules/03_asm_qc.smk b/workflow/rules/03_asm_qc.smk deleted file mode 100644 index edae4ec..0000000 --- a/workflow/rules/03_asm_qc.smk +++ /dev/null @@ -1,101 +0,0 @@ -# input haplotypes -HAP_FA_GZ = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap{n}.fa.gz" - -# unzip fasta -rule unzip_hap_fasta: - input: - HAP_FA_GZ - output: - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap{n}.fa" - shell: - "unpigz -k -p 1 {input}" - -### assembly stats with genometools -use rule genometools_on_raw_data as genometools_on_assembly with: - input: - HAP_FA_GZ - output: - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/assembly_stats/{id}_hap{n}.AStats.txt" - -### BUSCO stats on assembly (may not work on first run, rerun the WF) -rule busco: - input: - rules.unzip_hap_fasta.output - output: - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/busco/{id}_hap{n}/short_summary.specific.{lin}.{id}_hap{n}.txt" - params: - prefix=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/busco", - lineage=get_busco_lin, # get lineage from config - sample="{id}_hap{n}" - benchmark: - res_path + "/{runid}/benchmark/{id}_hap{n}_{lin}_busco.txt" - threads: 20 - resources: - mem_mb=100000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/busco:5.7.1" - shell: - "busco -f -i {input[0]} -l {params.lineage} --out_path {params.prefix} -o {params.sample} -m genome -c {threads}" - -### assembly stats -# jellyfish .jf output file directory - -rule kat: - input: - hap = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap{n}.fa.gz", - jellyfish = res_path + "/{runid}/01_raw_data_QC/04_kmer/{id}.jf" - output: - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/katplot/hap{n}/{id}_hap{n}.katplot.png" - params: - prefix="{id}_hap{n}", - path=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/katplot/hap{n}/{id}_hap{n}", - threads: 4 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/kat2.4.1" - shell: - "kat comp -o {params.path} -t {threads} -m 21--output_type png -v {input.jellyfish} {input.hap} && " - "kat plot spectra-cn -x 200 -o {params.path}.katplot.png {params.path}-main.mx" - -# telomeres -rule find_telomeres: - input: - rules.unzip_hap_fasta.output - output: - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/telomeres/{id}_hap{n}_telomeres.txt" - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/biopython1.75" - shell: - "python3 workflow/scripts/FindTelomeres.py {input} > {output}" - -rule LTR_finder: - input: - rules.unzip_hap_fasta.output - output: - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/{id}_hap{n}.scn" - singularity: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/ltr_finder:latest" - shell: - "ltr_finder -C {input} > {output}" - -rule LTR_retriever: - input: - scn=rules.LTR_finder.output, - genome=rules.unzip_hap_fasta.output - output: - lai=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/{id}_hap{n}.out.LAI", - recap=res_path + 
"/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/recap_{id}_hap{n}.tbl" - params: - prefix="{id}_hap{n}" - threads: 10 - singularity: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/ltr_retriever:3.0.1" - shell: - 'export PATH="/opt/LTR_retriever:$PATH" && ' - 'LTR_retriever -threads {threads} -genome {input.genome} -infinder {input.scn} && ' - 'mv {params.prefix}.fa.out.LAI {output.lai} && ' - 'mv {params.prefix}.fa.tbl {output.recap} && ' - 'rm {params.prefix}.fa?* && ' - 'rm -rf .RepeatMaskerCache &&' - 'rm {params.prefix}.fa' - - \ No newline at end of file diff --git a/workflow/rules/05.5_purged_asm_qc_merqury.smk b/workflow/rules/05.5_purged_asm_qc_merqury.smk deleted file mode 100644 index 8e88dbf..0000000 --- a/workflow/rules/05.5_purged_asm_qc_merqury.smk +++ /dev/null @@ -1,96 +0,0 @@ -# similar to 03.5 -## copy reads db created with meryl -rule purge_cp_meryl: - input: - rules.meryl.output - output: - temp(directory(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_reads-db_k21.meryl")) - params: - path=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury" - shell: - "cp -r {input} {params.path}" - -# reuse rules from 03.5 -use rule cp_hap as purge_cp with: - input: - hap1=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap1/{id}_hap1.purged.fa", - hap2=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap2/{id}_hap2.purged.fa" - output: - hap1=temp(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_hap1.purged.fa"), - hap2=temp(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_hap2.purged.fa") - params: - path=res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury" - -use rule merqury as purge_merqury with: - input: - read_db = rules.purge_cp_meryl.output, - hap1 = rules.purge_cp.output.hap1, - hap2 = rules.purge_cp.output.hap2 - output: - qv = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_purge_merqury.qv", - stat = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_purge_merqury.completeness.stats" - params: - prefix = "{id}_purge_merqury", - path = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury" - benchmark: - res_path + "/{runid}/benchmark/{id}_merqury_purged.txt" - -######### MERQURY TRIO -rule cp_purge_trio: - input: - meryl_db = rules.meryl.output, - p1 = rules.merqury_trio.output.p1_hapmer, - p2 = rules.merqury_trio.output.p2_hapmer - output: - meryl_db = temp(directory(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_reads-db_k21.meryl")), - p1 = temp(directory(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_P1_reads-db_k21.hapmer.meryl")), - p2 = temp(directory(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_P2_reads-db_k21.hapmer.meryl")) - params: - path = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury", - shell: - "cp -r {input.meryl_db} {params.path} && " - "ln -s {input.p1} {output.p1} && " - "ln -s {input.p2} {output.p2}" - -rule cp_hap_trio: - input: - hap1 = res_path + 
"/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap1/{id}_hap1.purged.fa", - hap2 = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap2/{id}_hap2.purged.fa", - output: - hap1 = temp(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_hap1.purged.fasta"), - hap2 = temp(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_hap2.purged.fasta"), - params: - path = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury" - shell: - "cp {input.hap1} {output.hap1} && " - "cp {input.hap2} {output.hap2}" - -rule purge_merqury_trio: - input: - p1 = rules.cp_purge_trio.output.p1, - p2 = rules.cp_purge_trio.output.p2, - read_db = rules.cp_purge_trio.output.meryl_db, - hap1 = rules.cp_hap_trio.output.hap1, - hap2 = rules.cp_hap_trio.output.hap2 - output: - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_purge_merqury_trio.{id}_hap1.purged.100_20000.phased_block.stats", - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_purge_merqury_trio.{id}_hap2.purged.100_20000.phased_block.stats", - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_purge_merqury_trio.{id}_hap1.purged.block.N.png", - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_purge_merqury_trio.{id}_hap2.purged.block.N.png", - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_purge_merqury_trio.qv", - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_purge_merqury_trio.completeness.stats", - res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury/{id}_purge_merqury_trio.hapmers.blob.png" - params: - path = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/merqury", - prefix = "{id}_purge_merqury_trio" - benchmark: - res_path + "/{runid}/benchmark/{id}_merqury_trio_purged.txt" - threads: 20 - resources: - mem_mb=60000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/merqury1.3" - shell: - "cd {params.path} && " - "export MERQURY=/usr/local/share/merqury && " - "merqury.sh {input.read_db} {input.p1} {input.p2} {input.hap1} {input.hap2} {params.prefix}" \ No newline at end of file diff --git a/workflow/rules/06_sym_link_hap.smk b/workflow/rules/06_sym_link_hap.smk deleted file mode 100644 index 27fad12..0000000 --- a/workflow/rules/06_sym_link_hap.smk +++ /dev/null @@ -1,10 +0,0 @@ -rule link_purged_asm: - input: - hap1 = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap1/{id}_hap1.purged.fa", - hap2 = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap2/{id}_hap2.purged.fa" - output: - hap1 = res_path + "/{runid}/{id}_hap1.fa", - hap2 = res_path + "/{runid}/{id}_hap2.fa" - shell: - "ln -s {input.hap1} {output.hap1} && " - "ln -s {input.hap2} {output.hap2}" \ No newline at end of file diff --git a/workflow/rules/03.5_asm_qc_merqury.smk b/workflow/rules/unimplemented.smk similarity index 67% rename from workflow/rules/03.5_asm_qc_merqury.smk rename to workflow/rules/unimplemented.smk index 7e3747b..e3e0f44 100644 --- a/workflow/rules/03.5_asm_qc_merqury.smk +++ b/workflow/rules/unimplemented.smk @@ -1,58 +1,37 @@ 
-######### MERQURY -### create reads db necessary for merqury -rule meryl: +### QC on .bam files with LongQC +rule longqc: input: - get_fasta + abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{Bid}.bam" output: - directory(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_reads-db_k21.meryl") + directory(res_path + "/{Bid}/{run}/01_raw_data_QC/02_longQC") benchmark: - res_path + "/{runid}/benchmark/{id}_meryl.txt" - threads: 20 + res_path + "/{Bid}/{run}/benchmark/longqc.txt" + priority: 1 + threads: 8 resources: mem_mb=60000 container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/merqury1.3" - shell: - "meryl k=21 count {input} output {output}" - -### temporary haplotype copy used by merqury -rule cp_hap: - input: - hap1=rules.hap_gfa_to_fasta.output.hap1_fa, - hap2=rules.hap_gfa_to_fasta.output.hap2_fa - output: - hap1=temp(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_hap1.fa.gz"), - hap2=temp(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_hap2.fa.gz") - params: - path = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury" + "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/longqc1.2.0c" shell: - "cp {{{input.hap1},{input.hap2}}} {params.path}" + "longQC sampleqc -x pb-hifi -o {output} {input}" -### assembly quality -rule merqury: +### QC on .fastq.gz files with FastQC +rule fastqc: input: - read_db = rules.meryl.output, - hap1 = rules.cp_hap.output.hap1, - hap2 = rules.cp_hap.output.hap2 + get_fastq output: - qv = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_merqury.qv", - stat = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_merqury.completeness.stats" + multiext(res_path + "/{Fid}/{run}/01_raw_data_QC/01_fastQC/{Fid}_fastqc", ".html", ".zip") params: - prefix = "{id}_merqury", - path = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury", + output_path=res_path + "/{Fid}/{run}//01_raw_data_QC/01_fastQC/" benchmark: - res_path + "/{runid}/benchmark/{id}_merqury.txt" - threads: 20 - resources: - mem_mb=60000 + res_path + "/{Fid}/{run}/benchmark/fastqc.txt" + priority: 1 + threads: 4 container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/merqury1.3" + "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/fastqc:0.12.1" shell: - "cd {params.path} && " - "export MERQURY=/usr/local/share/merqury && " - "merqury.sh {input.read_db} {input.hap1} {input.hap2} {params.prefix}" + "fastqc -o {params.output_path} {input}" -######### MERQURY TRIO rule meryl_trio: input: p1 = get_p1, -- GitLab From 255eee1e5e461c68036c19f289d90c93a6ebabfa Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 16:46:13 +0100 Subject: [PATCH 085/178] remove obsolete version of the functions --- workflow/scripts/from_config/hifiasm_mode.py | 48 -------------------- workflow/scripts/from_config/parameters.py | 43 ------------------ 2 files changed, 91 deletions(-) delete mode 100644 workflow/scripts/from_config/hifiasm_mode.py delete mode 100644 workflow/scripts/from_config/parameters.py diff --git a/workflow/scripts/from_config/hifiasm_mode.py b/workflow/scripts/from_config/hifiasm_mode.py deleted file mode 100644 index 82c2e7c..0000000 --- a/workflow/scripts/from_config/hifiasm_mode.py +++ /dev/null @@ -1,48 +0,0 @@ -########### GET THE ASSEMBLY MODE ########### -def get_mode(wildcards): - id_name = 
wildcards.id - mode = config[f'{id_name}']["mode"] - return(mode) - -def get_mode_hap1(wildcards): - id_name = wildcards.id - mode = config[f'{id_name}']["mode"] - if mode == "hi-c": - return(str(rules.hifiasm_hic.output.hap1)) - elif mode == "trio": - return(str(rules.hifiasm_trio.output.hap1)) - elif mode == "default": - return(str(rules.hifiasm.output.hap1)) - -def get_mode_hap2(wildcards): - id_name = wildcards.id - mode = config[f'{id_name}']["mode"] - if mode == "hi-c": - return(str(rules.hifiasm_hic.output.hap2)) - elif mode == "trio": - return(str(rules.hifiasm_trio.output.hap2)) - elif mode == "default": - return(str(rules.hifiasm.output.hap2)) - -########### GET MODE REQUIRED FILES ########### -#### HI-C MODE -def get_r1(wildcards): - id = wildcards.id - r1 = config[f'{id}']["r1"] - return r1 - -def get_r2(wildcards): - id = wildcards.id - r2 = config[f'{id}']["r2"] - return r2 - -#### TRIO MODE -def get_p1(wildcards): - id = wildcards.id - p1 = config[f'{id}']["p1"] - return p1 - -def get_p2(wildcards): - id = wildcards.id - p2 = config[f'{id}']["p2"] - return p2 \ No newline at end of file diff --git a/workflow/scripts/from_config/parameters.py b/workflow/scripts/from_config/parameters.py deleted file mode 100644 index 442ae05..0000000 --- a/workflow/scripts/from_config/parameters.py +++ /dev/null @@ -1,43 +0,0 @@ -from snakemake.io import expand - -########### GET PARAMETERS FROM CONFIG ########### -#### BUSCO LINEAGE -def get_busco_lin(wildcards): - id_name = wildcards.id - lineage = config[f'{id_name}']["busco_lineage"] - return(lineage) - -#### Ploidy -def get_ploidy(wildcards): - id_name = wildcards.id - try : - ploidy = config[f'{id_name}']["ploidy"] - except KeyError: - return 2 - return ploidy - -#### RUN NAME -def get_run(wildcards): - id_name = wildcards.id - run = config[f'{id_name}']["run"] - return(run) - -#### FASTA -def get_fasta(wildcards): - id_name = wildcards.id - fa = config[f'{id_name}']["fasta"] - return(fa) - -#### FASTQ -def get_fastq(wildcards): - id_name = wildcards.Fid - fq = config[f'{id_name}']["fastq"] - return(fq) - -#### BAM -def get_bam(wildcards): - id_name = wildcards.Bid - fq = config[f'{id_name}']["bam"] - return(fq) - - -- GitLab From 529898a55893a243f3d6475e83a5a4c6d82910cb Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 16:48:53 +0100 Subject: [PATCH 086/178] relocate documentation out of the scripts directory --- {workflow/doc => doc}/Assembly-Mode/Hi-C-tutorial.md | 0 {workflow/doc => doc}/Assembly-Mode/Trio-tutorial.md | 0 {workflow/doc => doc}/Going-further.md | 0 {workflow/doc => doc}/Known-errors.md | 0 {workflow/doc => doc}/Outputs.md | 0 {workflow/doc => doc}/Programs.md | 0 {workflow/doc => doc}/Quick-start.md | 0 {workflow/doc => doc}/Tar-data-preparation.md | 0 {workflow => doc}/documentation.md | 0 {workflow/doc => doc}/fig/rule_dag.svg | 0 10 files changed, 0 insertions(+), 0 deletions(-) rename {workflow/doc => doc}/Assembly-Mode/Hi-C-tutorial.md (100%) rename {workflow/doc => doc}/Assembly-Mode/Trio-tutorial.md (100%) rename {workflow/doc => doc}/Going-further.md (100%) rename {workflow/doc => doc}/Known-errors.md (100%) rename {workflow/doc => doc}/Outputs.md (100%) rename {workflow/doc => doc}/Programs.md (100%) rename {workflow/doc => doc}/Quick-start.md (100%) rename {workflow/doc => doc}/Tar-data-preparation.md (100%) rename {workflow => doc}/documentation.md (100%) rename {workflow/doc => doc}/fig/rule_dag.svg (100%) diff --git a/workflow/doc/Assembly-Mode/Hi-C-tutorial.md 
b/doc/Assembly-Mode/Hi-C-tutorial.md similarity index 100% rename from workflow/doc/Assembly-Mode/Hi-C-tutorial.md rename to doc/Assembly-Mode/Hi-C-tutorial.md diff --git a/workflow/doc/Assembly-Mode/Trio-tutorial.md b/doc/Assembly-Mode/Trio-tutorial.md similarity index 100% rename from workflow/doc/Assembly-Mode/Trio-tutorial.md rename to doc/Assembly-Mode/Trio-tutorial.md diff --git a/workflow/doc/Going-further.md b/doc/Going-further.md similarity index 100% rename from workflow/doc/Going-further.md rename to doc/Going-further.md diff --git a/workflow/doc/Known-errors.md b/doc/Known-errors.md similarity index 100% rename from workflow/doc/Known-errors.md rename to doc/Known-errors.md diff --git a/workflow/doc/Outputs.md b/doc/Outputs.md similarity index 100% rename from workflow/doc/Outputs.md rename to doc/Outputs.md diff --git a/workflow/doc/Programs.md b/doc/Programs.md similarity index 100% rename from workflow/doc/Programs.md rename to doc/Programs.md diff --git a/workflow/doc/Quick-start.md b/doc/Quick-start.md similarity index 100% rename from workflow/doc/Quick-start.md rename to doc/Quick-start.md diff --git a/workflow/doc/Tar-data-preparation.md b/doc/Tar-data-preparation.md similarity index 100% rename from workflow/doc/Tar-data-preparation.md rename to doc/Tar-data-preparation.md diff --git a/workflow/documentation.md b/doc/documentation.md similarity index 100% rename from workflow/documentation.md rename to doc/documentation.md diff --git a/workflow/doc/fig/rule_dag.svg b/doc/fig/rule_dag.svg similarity index 100% rename from workflow/doc/fig/rule_dag.svg rename to doc/fig/rule_dag.svg -- GitLab From ff1a66906164ab1388b87bf8616197c6317b5a84 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 17:13:21 +0100 Subject: [PATCH 087/178] remove obsolete snakefile --- workflow/Snakefile_obs.smk | 132 ------------------------------------- 1 file changed, 132 deletions(-) delete mode 100644 workflow/Snakefile_obs.smk diff --git a/workflow/Snakefile_obs.smk b/workflow/Snakefile_obs.smk deleted file mode 100644 index 6135456..0000000 --- a/workflow/Snakefile_obs.smk +++ /dev/null @@ -1,132 +0,0 @@ -configfile: ".config/masterconfig.yaml" - -# Include all the scripts -include: "scripts/from_config/hifiasm_mode.py" -include: "scripts/from_config/parameters.py" -include: "scripts/from_config/target_list.py" -include: "scripts/path_helper.py" - -# Get paths to the WD -if config["root"].startswith("."): - abs_root_path = get_abs_root_path() - res_path = get_res_path() -else: - abs_root_path = config["root"] - res_path = abs_root_path + "/" + config["resdir"] - -# Include all the rules -include: "rules/01_qc.smk" -include: "rules/02_asm.smk" -include: "rules/03_asm_qc.smk" -include: "rules/03.5_asm_qc_merqury.smk" -include: "rules/04_purge_dups.smk" -include: "rules/05_purged_asm_qc.smk" -include: "rules/05.5_purged_asm_qc_merqury.smk" -include: "rules/06_sym_link_hap.smk" -include: "rules/07_report.smk" -include: "rules/00_runtime.smk" - -# Get the filenames of inputs -IDS=config["IDS"] -bamIDS=check_bam(IDS) -fastqIDS=check_fastq(IDS) - -RUNID = run_id(config["IDS"]) -BID_RUN = run_BFid(bamIDS) -FID_RUN = run_BFid(fastqIDS) - -# Create the list of desired outputs -## For raw data -longqc_output = expand(res_path + "/{Bid}/{run}/01_raw_data_QC/02_longQC", zip, - run=BID_RUN, Bid=bamIDS), -fastqc_output = expand(res_path + "/{Fid}/{run}/01_raw_data_QC/01_fastQC/{Fid}_fastqc.{ext}", zip, - run=FID_RUN, Fid=fastqIDS, ext=["html", "zip"]) - -## Reports 
-REP_ID = for_report(IDS) -RUNID_REG = run_id(REP_ID) -BUSCO_LIN = busco_lin(REP_ID) -### We create additional lists for purge_dups applications -PURGE_ID = for_purge(IDS) -RUNID_PURGE = run_id(PURGE_ID) -BUSCO_LIN_PURGE = busco_lin(PURGE_ID) - -purged_report_output = expand(res_path + "/{runid}/p_report_{id}.{lin}.html", zip, - runid=RUNID_PURGE, id=PURGE_ID, lin = BUSCO_LIN_PURGE ) - -report_output = expand(res_path + "/{runid}/report_{id}.{lin}.html", zip, - runid=RUNID_REG, id=REP_ID, lin = BUSCO_LIN) - -### Same thing for trio -REP_TRIO_ID = for_report(IDS, trio = True) -RUNID_TRIO = run_id(REP_TRIO_ID) -BUSCO_LIN_TRIO = busco_lin(REP_TRIO_ID) -### We create additional lists for purge_dups applications -PURGE_ID_TRIO = for_purge(IDS, trio = True) -RUNID_PURGE_TRIO = run_id(PURGE_ID) -BUSCO_LIN_TRIO_PURGE = busco_lin(PURGE_ID_TRIO) - -purged_report_trio_output = expand(res_path + "/{runid}/p_report_trio_{id}.{lin}.html", zip, - runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, lin = BUSCO_LIN_TRIO_PURGE) - -report_trio_output = expand(res_path + "/{runid}/report_trio_{id}.{lin}.html", zip, - runid=RUNID_TRIO, id=REP_TRIO_ID, lin = BUSCO_LIN_TRIO) - -# Add symbolic link to final assembly -symb_link1 = expand(res_path + "/{runid}/{id}_hap{n}.fa", zip, - runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"]) -symb_link2 = expand(res_path + "/{runid}/{id}_hap{n}.fa", zip, - runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"]) -## PURGE_DUPS CUTOFFS GRAPH -cut_eval1 = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png", zip, - runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"]) -cut_eval2 = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png", zip, - runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"]) - -## BUSCO -busco_reg = expand(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/busco/{id}_hap{n}/short_summary.specific.{lin}.{id}_hap{n}.txt", zip, - runid=RUNID_REG, id=REP_ID, n=["1", "2"], lin = BUSCO_LIN) - -busco_purged_reg = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/busco/{id}_purged_hap{n}/short_summary.specific.{lin}.{id}_purged_hap{n}.txt", zip, - runid=RUNID_PURGE, id=PURGE_ID, n=["1", "2"], lin = BUSCO_LIN) - -busco_trio = expand(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/busco/{id}_hap{n}/short_summary.specific.{lin}.{id}_hap{n}.txt", zip, - runid=RUNID_TRIO, id=REP_TRIO_ID, n=["1", "2"], lin = BUSCO_LIN_TRIO) -busco_purged_trio = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC/busco/{id}_purged_hap{n}/short_summary.specific.{lin}.{id}_purged_hap{n}.txt", zip, - runid=RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, n=["1", "2"], lin = BUSCO_LIN_TRIO) - -## RUNTIME -time = expand(res_path + "/{runid}/runtime.{id}.{lin}.txt", zip, - runid = RUNID_REG, id=REP_ID, lin=BUSCO_LIN) -time_trio = expand(res_path + "/{runid}/runtime_trio.{id}.{lin}.txt", zip, - runid = RUNID_TRIO, id=REP_TRIO_ID, lin=BUSCO_LIN_TRIO) -time_purge = expand(res_path + "/{runid}/p_runtime.{id}.{lin}.txt", zip, - runid = RUNID_PURGE, id=PURGE_ID, lin=BUSCO_LIN) -time_trio_purge = expand(res_path + "/{runid}/p_runtime_trio.{id}.{lin}.txt", zip, - runid = RUNID_PURGE_TRIO, id=PURGE_ID_TRIO, lin=BUSCO_LIN_TRIO) - -rule_all_input_list = [ - longqc_output, - fastqc_output, - cut_eval1, - cut_eval2, - symb_link1, - symb_link2, - report_output, - 
report_trio_output, - busco_reg, - busco_purged_reg, - busco_trio, - busco_purged_trio, - time, - time_trio, - time_trio_purge, - time_purge, - purged_report_output, - purged_report_trio_output -] - -#### target files -rule all: - input: - all_input = rule_all_input_list \ No newline at end of file -- GitLab From 358d5957ea336762dabce1c376a0df877b1155fd Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 8 Jan 2025 17:13:46 +0100 Subject: [PATCH 088/178] Add dynamic rule for LTR analysis --- workflow/Snakefile | 54 ++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index c75c544..6f13500 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -152,7 +152,7 @@ rule unpigz_to_fasta: input: rules.haplotigs_handling.output.hap output: - hap_unziped = temp(os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","{sample}_final_hap{n}.fa")) + temp(os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","{sample}_final_hap{n}.fa")) container: f"{container_registry}/pigz" threads: 4 @@ -281,6 +281,8 @@ rule meryl: shell: "meryl k={params.km_size} threads={threads} count {input} output {output}" +# NOT TESTED +# Calculates metrics like QV and completeness, providing a quantitative assessment of the genome assembly. rule merqury: input: km_database = rules.meryl.output, @@ -310,35 +312,45 @@ rule merqury: merqury.sh {input.read_db} {params.dirr}/tmp_hap1.fasta.gz {params.dirr}/tmp_hap1.fasta.gz {params.prefix} """ + rule LTR_finder: input: - rules.unzip_hap_fasta.output + rules.unpigz_to_fasta.output output: - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/{id}_hap{n}.scn" - singularity: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/ltr_finder:latest" + os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "{sample}_hap{n}.scn") + resources: + mem_mb=60000, + time="50:00:00" + container: + f"{container_registry}/ltr_finder:latest" shell: "ltr_finder -C {input} > {output}" rule LTR_retriever: input: - scn=rules.LTR_finder.output, - genome=rules.unzip_hap_fasta.output + scn = rules.LTR_finder.output, + hap = ules.unpigz_to_fasta.output output: - lai=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/{id}_hap{n}.out.LAI", - recap=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/LAI/recap_{id}_hap{n}.tbl" + lai = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "{sample}_hap{n}.out.LAI"), + recap = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "recap_{sample}_hap{n}.tbl") params: - prefix="{id}_hap{n}" - threads: 10 - singularity: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/ltr_retriever:3.0.1" + prefix="{sample}_hap{n}", + dirr = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR") + threads: 20 + resources: + mem_mb=250000, + time="50:00:00" + container: + f"{container_registry}/ltr_retriever:3.0.1" shell: - 'export PATH="/opt/LTR_retriever:$PATH" && ' - 'LTR_retriever -threads {threads} -genome {input.genome} -infinder {input.scn} && ' - 'mv {params.prefix}.fa.out.LAI {output.lai} && ' - 'mv {params.prefix}.fa.tbl {output.recap} && ' - 'rm {params.prefix}.fa?* && ' - 'rm -rf .RepeatMaskerCache &&' - 'rm {params.prefix}.fa' - + ''' + export PATH="/opt/LTR_retriever:$PATH" && + cd {params.dirr} && + LTR_retriever -threads 
{threads} -genome {input.hap} -infinder {input.scn} && + mv {params.prefix}.fa.out.LAI {output.lai} && + mv {params.prefix}.fa.tbl {output.recap} && + rm {params.prefix}.fa?* && + rm -rf .RepeatMaskerCache && + rm {params.prefix}.fa + ''' \ No newline at end of file -- GitLab From 1d1ceb7df08ef49e5bdba64dccdcc17e3dc0b7b5 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 09:35:46 +0100 Subject: [PATCH 089/178] Add the sample to the log output --- workflow/scripts/parameter_retrieval.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflow/scripts/parameter_retrieval.py b/workflow/scripts/parameter_retrieval.py index 5a0bde8..a3963b0 100644 --- a/workflow/scripts/parameter_retrieval.py +++ b/workflow/scripts/parameter_retrieval.py @@ -6,7 +6,7 @@ def get_purge_force(wildcards): try : force = config["samples"][wildcards.sample]["assembly_purge_force"] except KeyError: - print('Asm4pg -> No "assembly_purge_force" specified, using l3 by default') + print('Asm4pg -> "assembly_purge_force" unspecified for ' + wildcards.sample + ', using l3 by default') return '3' return force @@ -15,7 +15,7 @@ def get_mode(wildcards): try : mode = config["samples"][wildcards.sample]["mode"] except KeyError: - print('Asm4pg -> No "mode" specified, using default assembly mode for hifiasm') + print('Asm4pg -> "mode" unspecified for ' + wildcards.sample + ', using default assembly mode for hifiasm') return 'default' return mode @@ -32,7 +32,7 @@ def get_purge_bool(wildcards): try : purge_bool = config["samples"][wildcards.sample]["run_purge_dups"] except KeyError: - print('Asm4pg -> "run_purge_dups" unspecified, using "False" by default') + print('Asm4pg -> "run_purge_dups" unspecified for ' + wildcards.sample + ', using "False" by default') return False return purge_bool @@ -40,7 +40,7 @@ def get_busco_lin(wildcards) -> str: try : lin = config["samples"][wildcards.sample]["busco_lineage"] except KeyError: - print('Asm4pg -> "busco_lineage" unspecified for ' + config["samples"][wildcards.sample]+ ', using "eukaryota_odb10" by default') + print('Asm4pg -> "busco_lineage" unspecified for ' + wildcards.sample + ', using "eukaryota_odb10" by default') return "eukaryota_odb10" return lin @@ -48,7 +48,7 @@ def get_ploidy(wildcards) -> int: try : ploidy = config["samples"][wildcards.sample]["ploidy"] except KeyError: - print('Asm4pg -> "ploidy" unspecified for ' + config["samples"][wildcards.sample]+ ', using 2 by default') + print('Asm4pg -> "ploidy" unspecified for ' + wildcards.sample + ', using 2 by default') return 2 return ploidy @@ -56,6 +56,6 @@ def get_kmer_size(wildcards) -> int: try : size = config["samples"][wildcards.sample]["kmer_size"] except KeyError: - print('Asm4pg -> "kmer_size" unspecified for ' + config["samples"][wildcards.sample]+ ', using 21 by default') + print('Asm4pg -> "kmer_size" unspecified for ' + wildcards.sample + ', using 21 by default') return 21 return size \ No newline at end of file -- GitLab From 3559c08f8d3273fade07c577c8fcc9d81d19d7e8 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 09:35:58 +0100 Subject: [PATCH 090/178] ignore the busco files --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d831bde..5693717 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,5 @@ node_modules node_modules/* !slurm_logs/* !workflow/* -.snakemake \ No newline at end of file +.snakemake +busco_downloads \ No newline at end of 
file -- GitLab From 01ad33108cb33870a51342a579c4b03bd3cebb24 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 09:36:56 +0100 Subject: [PATCH 091/178] Remove file, targeting now automatic --- workflow/scripts/from_config/target_list.py | 44 --------------------- 1 file changed, 44 deletions(-) delete mode 100644 workflow/scripts/from_config/target_list.py diff --git a/workflow/scripts/from_config/target_list.py b/workflow/scripts/from_config/target_list.py deleted file mode 100644 index 1d37a96..0000000 --- a/workflow/scripts/from_config/target_list.py +++ /dev/null @@ -1,44 +0,0 @@ -from snakemake.io import expand -import os - -########### FOR TARGET RULE ########### -#### CREATE RUN+ID LIST -def run_id(id_list): - run_list = [] - for i in id_list: - run = config[i]["run"] - run_list.append(i + "/" + run) - RUNID = expand("{runid}", runid = run_list) - return(RUNID) - -def run_BFid(id_list): - run_list = [] - for i in id_list: - run = config[i]["run"] - run_list.append(run) - RUNID = expand("{runid}", runid = run_list) - return(RUNID) - -# Create a list of purge datasets -def for_purge(id_list, trio =False): - NAME = [] - for i in id_list: - mode = config[i]["mode"] - if mode == "trio" and trio and config[i]["run_purge_dups"]: - NAME.append(i) - elif trio == False and config[i]["run_purge_dups"]: - NAME.append(i) - return(NAME) - -# Create a list of not_purged datasets -def for_report(id_list, trio =False): - NAME = [] - for i in id_list: - mode = config[i]["mode"] - if mode == "trio" and trio and config[i]["run_purge_dups"]==False: - NAME.append(i) - elif trio == False and config[i]["run_purge_dups"]==False: - NAME.append(i) - return(NAME) - - -- GitLab From e2adad7412a82815172f42538b8220b9504a1927 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 09:37:14 +0100 Subject: [PATCH 092/178] Fix pigs fasta output --- workflow/Snakefile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 6f13500..f3fcbd2 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -10,8 +10,7 @@ import yaml container_registry = config.get("container_registry", "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg") output_dir = config.get("output_dir", "results/") - - + rule all: input: # Required final assemblies and graphs @@ -30,6 +29,10 @@ rule all: expand( os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco", "short_summary.specific.{sample}_hap{n}.txt"), sample=config["samples"].keys(), n=[1, 2] + ), + expand( + os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "{sample}_genometools_stats.txt"), + sample=config["samples"].keys() ) # Genome assembly using hifiasm @@ -146,13 +149,12 @@ use rule genometools_on_raw_data as genometools_on_assembly with: os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "{sample}_hap{n}_genometools_stats.txt") priority: 0 -# NOT TESTED # Rule to unsip the haplotypes if needed for other rules rule unpigz_to_fasta: input: rules.haplotigs_handling.output.hap output: - temp(os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","{sample}_final_hap{n}.fa")) + temp(os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","{sample}_final_hap{n}.fasta")) container: f"{container_registry}/pigz" threads: 4 @@ -329,7 +331,7 @@ rule LTR_finder: rule LTR_retriever: input: scn = rules.LTR_finder.output, - hap = ules.unpigz_to_fasta.output + hap = 
rules.unpigz_to_fasta.output output: lai = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "{sample}_hap{n}.out.LAI"), recap = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "recap_{sample}_hap{n}.tbl") -- GitLab From 89abd9b086313134846d6fa6c8c6a5953281dea2 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 11:31:36 +0100 Subject: [PATCH 093/178] relocate file --- workflow/{rules => }/unimplemented.smk | 47 ++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) rename workflow/{rules => }/unimplemented.smk (68%) diff --git a/workflow/rules/unimplemented.smk b/workflow/unimplemented.smk similarity index 68% rename from workflow/rules/unimplemented.smk rename to workflow/unimplemented.smk index e3e0f44..c6be40b 100644 --- a/workflow/rules/unimplemented.smk +++ b/workflow/unimplemented.smk @@ -1,4 +1,16 @@ ### QC on .bam files with LongQC +rule multiqc: + output: + res_path + "/{runid}/multiqc/{id}_multiqc.html" + params: + indir = res_path + "/{runid}", + name = "{id}_multiqc", + out = res_path + "/{runid}/multiqc" + container: + "docker://ewels/multiqc" + shell: + "multiqc {params.indir} --filename {params.name} --outdir {params.out} --ignore \"*multiqc*\" -d -dd 1 -f" + rule longqc: input: abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{Bid}.bam" @@ -106,3 +118,38 @@ rule merqury_trio: "$MERQURY/trio/hapmers.sh {input.p1} {input.p2} {input.read_db} && " "merqury.sh {input.read_db} {output.p1_hapmer} {output.p2_hapmer} {input.hap1} {input.hap2} {params.prefix}" +rule no_purge_report_trio: + input: + # Reads QC + genomescope=RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png", + gt_reads=RAW_QC + "/03_genometools/{id}.RawStat.txt", + # Hifiasm assembly QC + gt_asm_1=ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt", + gt_asm_2=ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt", + busco_1=ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt", + busco_2=ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt", + kplot_1=ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png", + kplot_2=ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png", + tel_1=ASM_QC + "/telomeres/{id}_hap1_telomeres.txt", + tel_2=ASM_QC + "/telomeres/{id}_hap2_telomeres.txt", + merq_comp=ASM_QC + "/merqury/{id}_merqury_trio.completeness.stats", + merq_err=ASM_QC + "/merqury/{id}_merqury_trio.qv", + merq_blob=ASM_QC + "/merqury/{id}_merqury_trio.hapmers.blob.png", + merq_block_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.block.N.png", + merq_block_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.block.N.png", + merq_block_stats_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.100_20000.phased_block.stats", + merq_block_stats_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.100_20000.phased_block.stats" + output: + res_path + "/{runid}/{id}/{lin}/report_trio.html" + params: + id="{id}", # get filename + mode=get_mode, # get assembly mode + p1=get_p1, + p2=get_p2, + run=get_run, + purge=get_purge, + purge_force = get_purge_force + container: + "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" + script: + "../scripts/report_trio.Rmd" \ No newline at end of file -- GitLab From 6f15a3458e67d35411dd7e9840ba4371d5b8e098 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 11:32:01 +0100 Subject: [PATCH 094/178] remove obsolete smk file --- workflow/rules/07_report.smk | 216 ----------------------------------- 1 file changed, 216 
deletions(-) delete mode 100644 workflow/rules/07_report.smk diff --git a/workflow/rules/07_report.smk b/workflow/rules/07_report.smk deleted file mode 100644 index 2f7ed89..0000000 --- a/workflow/rules/07_report.smk +++ /dev/null @@ -1,216 +0,0 @@ -### create report at the end of the workflow - -# path variables -RAW_QC = res_path + "/{runid}/01_raw_data_QC" -ASM_QC = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC" -P_ASM_QC = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC" - -rule report: - input: - genomescope=RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png", - gt_reads=RAW_QC + "/03_genometools/{id}.RawStat.txt", - gt_asm_1=ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt", - gt_asm_2=ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt", - busco_1=ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt", - busco_2=ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt", - kplot_1=ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png", - kplot_2=ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png", - tel_1=ASM_QC + "/telomeres/{id}_hap1_telomeres.txt", - tel_2=ASM_QC + "/telomeres/{id}_hap2_telomeres.txt", - LRT_recap_1=ASM_QC + "/LAI/recap_{id}_hap1.tbl", - LAI_1=ASM_QC + "/LAI/{id}_hap1.out.LAI", - LRT_recap_2=ASM_QC + "/LAI/recap_{id}_hap2.tbl", - LAI_2=ASM_QC + "/LAI/{id}_hap2.out.LAI", - merq_comp=rules.merqury.output.stat, - merq_err=rules.merqury.output.qv, - P_gt_asm_1 = P_ASM_QC + "/assembly_stats/{id}_purged_hap1.AStats.txt", - P_gt_asm_2 = P_ASM_QC + "/assembly_stats/{id}_purged_hap2.AStats.txt", - P_busco_1 = P_ASM_QC + "/busco/{id}_purged_hap1/short_summary.specific.{lin}.{id}_purged_hap1.txt", - P_busco_2 = P_ASM_QC + "/busco/{id}_purged_hap2/short_summary.specific.{lin}.{id}_purged_hap2.txt", - P_kplot_1 = P_ASM_QC + "/katplot/hap1/{id}_purged_hap1.katplot.png", - P_kplot_2 = P_ASM_QC + "/katplot/hap2/{id}_purged_hap2.katplot.png", - P_tel_1 = P_ASM_QC + "/telomeres/{id}_hap1_purged_telomeres.txt", - P_tel_2 = P_ASM_QC + "/telomeres/{id}_hap2_purged_telomeres.txt", - P_LRT_recap_1 = P_ASM_QC + "/LAI/purge_recap_{id}_hap1.tbl", - P_LAI_1 = P_ASM_QC + "/LAI/purge_{id}_hap1.out.LAI", - P_LRT_recap_2 = P_ASM_QC + "/LAI/purge_recap_{id}_hap2.tbl", - P_LAI_2 = P_ASM_QC + "/LAI/purge_{id}_hap2.out.LAI", - P_merq_comp = rules.purge_merqury.output.stat, - P_merq_err = rules.purge_merqury.output.qv - output: - res_path + "/{runid}/{id}/{lin}/report.html" - params: - id="{id}", - mode=get_mode, - run=get_run, - purge=get_purge, - purge_force = get_purge_force, - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" - script: - "../scripts/report.Rmd" - - -rule rename_report: - input: - rules.report.output - output: - res_path + "/{runid}/p_report_{id}.{lin}.html" - shell: - "mv {input} {output}" - -rule no_purge_report: - input: - # Reads QC - genomescope=RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png", - gt_reads=RAW_QC + "/03_genometools/{id}.RawStat.txt", - # Hifiasm assembly QC - gt_asm_1=ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt", - gt_asm_2=ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt", - busco_1=ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt", - busco_2=ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt", - kplot_1=ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png", - kplot_2=ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png", - tel_1=ASM_QC + "/telomeres/{id}_hap1_telomeres.txt", - tel_2=ASM_QC + 
"/telomeres/{id}_hap2_telomeres.txt", - LRT_recap_1=ASM_QC + "/LAI/recap_{id}_hap1.tbl", - LAI_1=ASM_QC + "/LAI/{id}_hap1.out.LAI", - LRT_recap_2=ASM_QC + "/LAI/recap_{id}_hap2.tbl", - LAI_2=ASM_QC + "/LAI/{id}_hap2.out.LAI", - merq_comp=rules.merqury.output.stat, - merq_err=rules.merqury.output.qv - output: - res_path + "/{runid}/{id}/{lin}/report.html" - params: - id="{id}", - mode=get_mode, - run=get_run, - purge=get_purge, - purge_force = get_purge_force - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" - script: - "../scripts/report.Rmd" - -rule rename_no_purge_report: - input: - rules.no_purge_report.output - output: - res_path + "/{runid}/report_{id}.{lin}.html" - shell: - "mv {input} {output}" - -rule report_trio: - input: - # Reads QC - genomescope=RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png", - gt_reads=RAW_QC + "/03_genometools/{id}.RawStat.txt", - # Hifiasm assembly QC - gt_asm_1=ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt", - gt_asm_2=ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt", - busco_1=ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt", - busco_2=ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt", - kplot_1=ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png", - kplot_2=ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png", - tel_1=ASM_QC + "/telomeres/{id}_hap1_telomeres.txt", - tel_2=ASM_QC + "/telomeres/{id}_hap2_telomeres.txt", - merq_comp=ASM_QC + "/merqury/{id}_merqury_trio.completeness.stats", - merq_err=ASM_QC + "/merqury/{id}_merqury_trio.qv", - merq_blob=ASM_QC + "/merqury/{id}_merqury_trio.hapmers.blob.png", - merq_block_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.block.N.png", - merq_block_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.block.N.png", - merq_block_stats_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.100_20000.phased_block.stats", - merq_block_stats_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.100_20000.phased_block.stats", - P_gt_asm_1=P_ASM_QC + "/assembly_stats/{id}_purged_hap1.AStats.txt", - P_gt_asm_2=P_ASM_QC + "/assembly_stats/{id}_purged_hap2.AStats.txt", - P_busco_1=P_ASM_QC + "/busco/{id}_purged_hap1/short_summary.specific.{lin}.{id}_purged_hap1.txt", - P_busco_2=P_ASM_QC + "/busco/{id}_purged_hap2/short_summary.specific.{lin}.{id}_purged_hap2.txt", - P_kplot_1=P_ASM_QC + "/katplot/hap1/{id}_purged_hap1.katplot.png", - P_kplot_2=P_ASM_QC + "/katplot/hap2/{id}_purged_hap2.katplot.png", - P_tel_1=P_ASM_QC + "/telomeres/{id}_hap1_purged_telomeres.txt", - P_tel_2=P_ASM_QC + "/telomeres/{id}_hap2_purged_telomeres.txt", - P_merq_comp=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.completeness.stats", - P_merq_err=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.qv", - P_merq_blob=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.hapmers.blob.png", - P_merq_block_1=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap1.purged.block.N.png", - P_merq_block_2=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap2.purged.block.N.png", - P_merq_block_stats_1=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap1.purged.100_20000.phased_block.stats", - P_merq_block_stats_2=P_ASM_QC + "/merqury/{id}_purge_merqury_trio.{id}_hap2.purged.100_20000.phased_block.stats" - output: - res_path + "/{runid}/{id}/{lin}/report_trio.html" - params: - id="{id}", # get filename - mode=get_mode, # get assembly mode - p1=get_p1, - p2=get_p2, - run=get_run, - purge=get_purge, - purge_force = get_purge_force - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" 
- script: - "../scripts/report_trio.Rmd" - -rule rename_report_trio: - input: - rules.report_trio.output - output: - res_path + "/{runid}/p_report_trio_{id}.{lin}.html" - shell: - "mv {input} {output}" - -rule no_purge_report_trio: - input: - # Reads QC - genomescope=RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png", - gt_reads=RAW_QC + "/03_genometools/{id}.RawStat.txt", - # Hifiasm assembly QC - gt_asm_1=ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt", - gt_asm_2=ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt", - busco_1=ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt", - busco_2=ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt", - kplot_1=ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png", - kplot_2=ASM_QC + "/katplot/hap2/{id}_hap2.katplot.png", - tel_1=ASM_QC + "/telomeres/{id}_hap1_telomeres.txt", - tel_2=ASM_QC + "/telomeres/{id}_hap2_telomeres.txt", - merq_comp=ASM_QC + "/merqury/{id}_merqury_trio.completeness.stats", - merq_err=ASM_QC + "/merqury/{id}_merqury_trio.qv", - merq_blob=ASM_QC + "/merqury/{id}_merqury_trio.hapmers.blob.png", - merq_block_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.block.N.png", - merq_block_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.block.N.png", - merq_block_stats_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.100_20000.phased_block.stats", - merq_block_stats_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.100_20000.phased_block.stats" - output: - res_path + "/{runid}/{id}/{lin}/report_trio.html" - params: - id="{id}", # get filename - mode=get_mode, # get assembly mode - p1=get_p1, - p2=get_p2, - run=get_run, - purge=get_purge, - purge_force = get_purge_force - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" - script: - "../scripts/report_trio.Rmd" - -rule no_purge_rename_report_trio: - input: - rules.no_purge_report_trio.output - output: - res_path + "/{runid}/report_trio_{id}.{lin}.html" - shell: - "mv {input} {output}" - -rule multiqc: - output: - res_path + "/{runid}/multiqc/{id}_multiqc.html" - params: - indir = res_path + "/{runid}", - name = "{id}_multiqc", - out = res_path + "/{runid}/multiqc" - container: - "docker://ewels/multiqc" - shell: - "multiqc {params.indir} --filename {params.name} --outdir {params.out} --ignore \"*multiqc*\" -d -dd 1 -f" \ No newline at end of file -- GitLab From 160321a854f21b6c3fbb3001785b630278599d7d Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 11:32:32 +0100 Subject: [PATCH 095/178] add rule to generate the report --- workflow/Snakefile | 90 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 73 insertions(+), 17 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index f3fcbd2..d91a8bd 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -5,6 +5,7 @@ configfile: ".config/masterconfig.yaml" include: "scripts/parameter_retrieval.py" + import os import yaml @@ -107,6 +108,22 @@ rule haplotigs_handling: ./workflow/scripts/haplotigs_handling.sh {params.purge_dups_option} {input.hap_fasta} {output.hap} {params.prefix} {input.reads} {params.dirr} """ +# Rule to unsip the haplotypes if needed for other rules +rule unpigz_to_fasta: + input: + rules.haplotigs_handling.output.hap + output: + temp(os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","{sample}_final_hap{n}.fasta")) + container: + f"{container_registry}/pigz" + threads: 4 + resources: + mem_mb=25000, + time="10:00:00" + shell: + "unpigz -k -p 1 {input}" + + # Make 
purge_dups cutoffs graph rule cutoffs_graph: input: @@ -149,21 +166,6 @@ use rule genometools_on_raw_data as genometools_on_assembly with: os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "{sample}_hap{n}_genometools_stats.txt") priority: 0 -# Rule to unsip the haplotypes if needed for other rules -rule unpigz_to_fasta: - input: - rules.haplotigs_handling.output.hap - output: - temp(os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","{sample}_final_hap{n}.fasta")) - container: - f"{container_registry}/pigz" - threads: 4 - resources: - mem_mb=25000, - time="10:00:00" - shell: - "unpigz -k -p 1 {input}" - # NOT TESTED # BUSCO stats on assembly (may not work on first run, rerun the WF) rule busco: @@ -314,7 +316,8 @@ rule merqury: merqury.sh {input.read_db} {params.dirr}/tmp_hap1.fasta.gz {params.dirr}/tmp_hap1.fasta.gz {params.prefix} """ - +# NOT TESTED +# Identifies LTR retrotransposons rule LTR_finder: input: rules.unpigz_to_fasta.output @@ -328,6 +331,8 @@ rule LTR_finder: shell: "ltr_finder -C {input} > {output}" +# NOT TESTED +# Calculates the LTR Assembly Index (LAI), a metric for assembly quality based on LTR retrotransposons rule LTR_retriever: input: scn = rules.LTR_finder.output, @@ -355,4 +360,55 @@ rule LTR_retriever: rm -rf .RepeatMaskerCache && rm {params.prefix}.fa ''' - \ No newline at end of file + +# Rule to generate the html report +rule generate_report: + input: + genomescope = rules.genomescope.output.plot, + genometools_on_raw_data = rules.genometools_on_raw_data.output, + + genometools_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "{sample}_hap1_genometools_stats.txt"), + genometools_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "{sample}_hap2_genometools_stats.txt"), + + busco_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "short_summary.specific.{sample}_hap1.txt"), + busco_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "short_summary.specific.{sample}_hap2.txt"), + + kplot_hap1 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "katplot", "{sample}_hap1.katplot.png"), + kplot_hap2 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "katplot", "{sample}_hap2.katplot.png"), + + telomeres_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "telomeres", "{sample}_hap1_telomeres.txt"), + telomeres_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "telomeres", "{sample}_hap2_telomeres.txt"), + + LAI_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "{sample}_hap1.out.LAI"), + LAI_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "{sample}_hap2.out.LAI"), + + LRT_recap_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "recap_{sample}_hap1.tbl"), + LRT_recap_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "recap_{sample}_hap2.tbl"), + + merqury_stats=rules.merqury.output.stats, + merqury_qv=rules.merqury.output.qv + output: + os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html") + params: + sample = "{sample}", + mode = get_mode, + assembly_purge_force = get_purge_force, + run_purge_dups = get_purge_bool, + busco_lineage = get_busco_lin, + ploidy = get_ploidy, + kmer_size = get_kmer_size, + r1 = lambda wildcards: 
get_run(wildcards, run=1), + r2 = lambda wildcards: get_run(wildcards, run=2) + container: + f"{container_registry}/rmarkdown4.0.3" + script: + "../scripts/report.Rmd" + +# Rule to relocate the report +rule relocate_report: + input: + rules.generate_report.output + output: + os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html"), + shell: + "mv {input} {output}" \ No newline at end of file -- GitLab From 544d8af5396efe1906fbd95c114a95e131bdb207 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 11:32:51 +0100 Subject: [PATCH 096/178] remove obsolete readme file --- workflow/rules/README.md | 49 ---------------------------------------- 1 file changed, 49 deletions(-) delete mode 100644 workflow/rules/README.md diff --git a/workflow/rules/README.md b/workflow/rules/README.md deleted file mode 100644 index 2d7b436..0000000 --- a/workflow/rules/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# Content of each .smk file -## 01_qc -For raw data QC. Rules include the following programs: -- longQC -- fastQC -- Genometools -- Jellyfish -- GenomeScope - -## 02_asm -For assembly. Rules include different running modes of hifiasm, as well as a rule to obtain a FASTA from the hifiasm GFA output. - -## 03_asm_qc -For assembly QC. Rules include the following programs: -- busco -- kat -- FindTelomeres -- Genometools - -## 03.5_A_qc_merqury -For assembly QC. Rules include the following programs: -- meryl -- merqury -There is the regular merqury and merqury for trio - -## 04_purge_dups -For assembly purging, removal of haplotigs. Include purge_dups - -## 05_purged_asm_qc -For assembly QC after purging. Rules include: -- busco -- kat -- FindTelomeres -- Genometools - -## 05.5_PA_qc_merqury -For assembly QC after purging. Rules include: -- meryl -- merqury -There is the regular merqury and merqury for trio - -## 06_sym_link_hap -For easy acces to purged assemblies. Symbolic link in the run directory. - -## 07_report -For automatic report. Report is in the run directory. Run a R markdown script. 
-2 versions: -- regular & hi-c -- trio \ No newline at end of file -- GitLab From 44e8b216ab906dfeb1f6ed668b1ea5c297ac3330 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 11:33:07 +0100 Subject: [PATCH 097/178] Update the structure of the report script --- workflow/scripts/report.Rmd | 75 ++++++------------------------------- 1 file changed, 11 insertions(+), 64 deletions(-) diff --git a/workflow/scripts/report.Rmd b/workflow/scripts/report.Rmd index 8dc0ca0..b26f047 100644 --- a/workflow/scripts/report.Rmd +++ b/workflow/scripts/report.Rmd @@ -19,11 +19,17 @@ output: ---- -# `r snakemake@params[["id"]]` - run: `r snakemake@params[["run"]]` -* Run : `r snakemake@params[["run"]]` +# `r snakemake@params[["sample"]]` * Hifiasm mode : `r snakemake@params[["mode"]]` -* Hifiasm purge mode [0-3]: `r snakemake@params[["purge_force"]]` -* Purge conducted: `r if (snakemake@params[["purge"]]) { "Yes" } else { "No" }` +* Hifiasm purge mode : `r snakemake@params[["assembly_purge_force"]]` +* Purge conducted: `r if (snakemake@params[["run_purge_dups"]]) { "Yes" } else { "No" }` +* Busco lineage: `r snakemake@params[["busco_lineage"]]` +* Genome ploidy: `r snakemake@params[["ploidy"]]` +* Kmers size: `r snakemake@params[["kmer_size"]]` + +* If hi-c or trio mode : + * parent1/r1 `r snakemake@params[["r1"]]` + * parent2/r2 `r snakemake@params[["r2"]]` ---- @@ -105,63 +111,4 @@ cat(head(readLines(snakemake@input[["LRT_recap_2"]]), 50), sep = '\n') LAI ```{r comment='', echo=FALSE} cat(head(readLines(snakemake@input[["LAI_2"]]), 2), sep = '\n') -``` - -`r if (snakemake@params[["purge"]]) { "## Assembly QC - After Purge_dups" }` -`r if (snakemake@params[["purge"]]) { "### Assembly statistics" }` -`r if (snakemake@params[["purge"]]) { "#### Hap 1" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_gt_asm_1"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "#### Hap 2" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_gt_asm_2"]]), sep = '\n') -``` - -`r if (snakemake@params[["purge"]]) { "### K-mer completeness and error rate" }` -`r if (snakemake@params[["purge"]]) { "Completeness" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_merq_comp"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "Error rate " }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_merq_err"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "### BUSCO" }` -`r if (snakemake@params[["purge"]]) { "#### Hap 1" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_busco_1"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "#### Hap 2" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_busco_2"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "### Telomeres" }` -`r if (snakemake@params[["purge"]]) { "Telomeres present in assembly" }` -`r if (snakemake@params[["purge"]]) { "#### Hap 1" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_tel_1"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "#### Hap 2" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_tel_2"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "### LTR 
Assembly Index (LAI) " }` -`r if (snakemake@params[["purge"]]) { "#### Hap 1" }` -`r if (snakemake@params[["purge"]]) { "LTR recap" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(head(readLines(snakemake@input[["P_LRT_recap_1"]]), 50), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "LAI" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(head(readLines(snakemake@input[["P_LAI_1"]]), 2), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "#### Hap 2" }` -`r if (snakemake@params[["purge"]]) { "LTR recap" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(head(readLines(snakemake@input[["P_LRT_recap_2"]]), 50), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "LAI" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(head(readLines(snakemake@input[["P_LAI_2"]]), 2), sep = '\n') -``` +``` \ No newline at end of file -- GitLab From 8b84929955bd7c6cefd91fdcfb7655d24df408c3 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 13:27:44 +0100 Subject: [PATCH 098/178] update threads values --- workflow/Snakefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index d91a8bd..2b93a46 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -235,7 +235,6 @@ rule genomescope: ploidy = get_ploidy, km_size = get_kmer_size, dirr = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope"), - threads: 1 resources: mem_mb=40000, time="10:00:00" @@ -399,6 +398,9 @@ rule generate_report: kmer_size = get_kmer_size, r1 = lambda wildcards: get_run(wildcards, run=1), r2 = lambda wildcards: get_run(wildcards, run=2) + resources: + mem_mb=10000, + time="10:00:00" container: f"{container_registry}/rmarkdown4.0.3" script: -- GitLab From 657e8f2831fe6577cdf0c8fa071954fbe21b97c3 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 13:27:57 +0100 Subject: [PATCH 099/178] ignore the cache --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5693717..edcfc05 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,5 @@ node_modules/* !slurm_logs/* !workflow/* .snakemake -busco_downloads \ No newline at end of file +busco_downloads +.cache \ No newline at end of file -- GitLab From f0e2fffb1faa3f63fd0feab5e341faf1438b5492 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 13:28:11 +0100 Subject: [PATCH 100/178] Start to update the readme --- README.md | 91 ++++++++------- doc/Going-further.md | 19 ---- doc/Quick-start.md | 68 ------------ doc/Tar-data-preparation.md | 34 ------ doc/dag.svg | 215 ++++++++++++++++++++++++++++++++++++ 5 files changed, 259 insertions(+), 168 deletions(-) delete mode 100644 doc/Quick-start.md delete mode 100644 doc/Tar-data-preparation.md create mode 100644 doc/dag.svg diff --git a/README.md b/README.md index 8542698..1d3792b 100644 --- a/README.md +++ b/README.md @@ -5,73 +5,70 @@ An automatic and reproducible genome assembly workflow for pangenomic applicatio This workflow uses [Snakemake](https://snakemake.readthedocs.io/en/stable/) to quickly assemble genomes with a HTML report summarizing obtained assembly stats. -A first script (```prejob.sh```) prepares the data until *fasta.gz* files are obtained. A second script (```job.sh```) runs the genome assembly and stats. 
- -doc: [Gitlab pages](https://asm4pg.pages.mia.inra.fr/GenomAsm4pg/) - - - -## Table of contents -# Summary - -* [Introduction](README.md) -* [Documentation summary](workflow/documentation.md) - * [Requirements](workflow/documentation.md#asm4pg-requirements) - * [Tutorials](workflow/documentation.md#tutorials) - * [Quick start](workflow/doc/Quick-start.md) - * [Hi-C mode](workflow/doc/Assembly-Mode/Hi-C-tutorial.md) - * [Trio mode](workflow/doc/Assembly-Mode/Trio-tutorial.md) - * [Outputs](workflow/documentation.md#outputs) - * [Workflow output](workflow/doc/Outputs.md) - * [Optional data preparation](workflow/documentation.md#optional-data-preparation) - * [if your data is in a tarball archive](workflow/doc/Tar-data-preparation.md) - * [Going further](workflow/doc/Going-further.md) - * [Troubleshooting](workflow/documentation.md#known-errors) - * [known errors](workflow/doc/Known-errors.md) - * [Software Dependencies](workflow/documentation.md#programs) - * [Programs listing](workflow/doc/Programs.md) - * [Gitlab pages using honkit](honkit.md) + ## Repo directory structure ``` ├── README.md ├── job.sh -├── prejob.sh +├── local_run.sh +├── doc ├── workflow -│ ├── rules │ ├── scripts -│ ├── pre-job_snakefiles | └── Snakefile └── .config ├── snakemake_profile - | └── slurm - | ├── cluster_config.yml - | ├── config.yaml - | ├── CookieCutter.py - | ├── settings.json - | ├── slurm_utils.py - | ├── slurm-jobscript.sh - | ├── slurm-status.py - | └── slurm-submit.py └── masterconfig.yaml ``` -## Requirements -- snakemake >= 6.5.1 -- singularity +## Requirements +Miniforge, Singularity/Apptainer + +## How to Use +### 1. Set up +Clone the Git repository: +```bash +git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git && cd GenomAsm4pg +``` + +Install Miniforge and create a virtual environment: +```bash +conda create -n wf_env -c conda-forge -c bioconda snakemake=8.4.7 snakemake-executor-plugin-slurm +``` +> Use Miniforge with the conda-forge channel; see why [here](https://science-ouverte.inrae.fr/fr/offre-service/fiches-pratiques-et-recommandations/quelles-alternatives-aux-fonctionnalites-payantes-danaconda) (in French) + +### 2. Configure the pipeline +- Edit the `masterconfig` file in the `.config/` directory with your sample information (a hypothetical example entry is sketched further below). + +### 3. Run the workflow +#### On an HPC +- Edit `job.sh` with your email address and add the paths to the needed modules (`Singularity/Apptainer`, `Miniforge`) +- Provide the needed conda environment in `job.sh`, under `source activate wf_env` +- Run the workflow: +```bash +sbatch job.sh dry # First, check for warnings +sbatch job.sh run # Then start the actual run +``` +> **NB:** If your account name cannot be determined automatically, add it in the `.config/snakemake/profiles/slurm/config.yaml` file. +#### Locally +- Activate the environment: `source activate wf_env` +- Run the workflow: +```bash +./local_run.sh dry # First, check for warnings +./local_run.sh run # Then start the actual run +``` -## How to run the workflow -[wiki](https://forgemia.inra.fr/asm4pg/GenomAsm4pg/-/wikis/home) +## Using the full potential of the workflow +Asm4pg has many options; if you wish to modify the default values and know more about the workflow, please refer to the [documentation](doc/documentation.md). -## How to cite asm4pg? ## +## How to cite asm4pg? We are currently writing a publication about asm4pg. Meanwhile, if you use the pipeline, please cite it using the address of this repository. 
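For illustration, a minimal sample entry in `.config/masterconfig.yaml` might look like the sketch below. This is an assumption-laden sketch, not the shipped template: the per-sample option names and their fallback defaults are the ones read by `workflow/scripts/parameter_retrieval.py` (see patch 089 above), while the sample name, the paths, and the `fasta` input key are placeholders carried over from the older configuration format, so check them against the `masterconfig.yaml` file in `.config/` before use.

```yaml
# Hypothetical masterconfig sketch -- sample name, paths and the `fasta` key are
# placeholders; the other per-sample keys match workflow/scripts/parameter_retrieval.py.
container_registry: docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg  # optional, Snakefile default
output_dir: results/                      # optional, defaults to "results/"
samples:
  my_sample:                              # placeholder sample name
    fasta: tutorial_data/my_sample.fasta  # assumed input key (as in the previous config format)
    mode: default                         # hifiasm mode, falls back to "default"
    run_purge_dups: False                 # falls back to False
    assembly_purge_force: 3               # hifiasm purge level (-l), falls back to 3
    busco_lineage: eudicots_odb10         # falls back to "eukaryota_odb10"
    ploidy: 2                             # falls back to 2
    kmer_size: 21                         # falls back to 21
```

Any per-sample option may be omitted: the workflow then logs a message such as `Asm4pg -> "ploidy" unspecified for my_sample, using 2 by default` and continues with the fallback value.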
## License The content of this repository is licensed under <A HREF="https://choosealicense.com/licenses/gpl-3.0/">(GNU GPLv3)</A> -## Contacts ## +## Contacts For any troubleshooting, issue or feature suggestion, please use the issue tab of this repository. For any other question or if you want to help in developing asm4pg, please contact Ludovic Duvaux at ludovic.duvaux@inrae.fr diff --git a/doc/Going-further.md b/doc/Going-further.md index c508e49..fb3f321 100644 --- a/doc/Going-further.md +++ b/doc/Going-further.md @@ -1,7 +1,5 @@ # Going further -[TOC] - ## 01. In-depth options ### Job.sh options @@ -64,22 +62,5 @@ IDS: ["toy_dataset", "toy_dataset_trio"] ``` Running the workflow with this config will assemble only `toy_dataset` and `toy_dataset_trio`. - -## 3. Optional fastq and bam files -If fastq and bam are available and you want to do raw QC with fastQC and longQC, add the `fastq` and/or `bam` key in your config. The fasta, fastq and bam filenames have to be the same. For example: - -```yaml -IDS: ["toy_dataset"] - -toy_dataset: - fasta: "./GenomAsm4pg/tutorial_data/toy_dataset.fasta" - fastq: "./GenomAsm4pg/tutorial_data/toy_dataset.fastq" - bam: "./GenomAsm4pg/tutorial_data/toy_dataset.bam" - run: tutorial - ploidy: 2 - busco_lineage: eudicots_odb10 - mode: default -``` - ## 4. Add a reference genome You can add a reference genome to the `.masterconfig` and set `scafold_output` to True to run ragtag on your output. \ No newline at end of file diff --git a/doc/Quick-start.md b/doc/Quick-start.md deleted file mode 100644 index 548b5c2..0000000 --- a/doc/Quick-start.md +++ /dev/null @@ -1,68 +0,0 @@ -# Quick start -This tutorial shows how to use the workflow with default assembly mode which takes PacBio Hifi data as input. - -[TOC] - -## Clone repository -```bash -git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git -``` -Clone the repository in your desired folder. -## 1. Set up the scripts -### In job.sh -```bash -cd GenomAsm4pg/ -vim job.sh -``` -Modify: -- Line 17: Set your email address. -- Line 53: Set the path to your dataset folder. -### In masterconfig.yaml -```bash -vim .config/masterconfig.yaml -``` -Modify -- Line 2: Set the path to your output folder. -```yaml -# absolute path to your desired output path -root: ./GenomAsm4pg/<your_output_folder> -``` -Modify -- Line 18: Add all your raw datasets in IDS. -- Line 20: Provide the parameters for all datasets. -```yaml -####################### job - workflow ####################### -### CONFIG -IDS: ["toy_dataset"] - -toy_dataset: - fasta: "./GenomAsm4pg/tutorial_data/toy_dataset.fasta" - run: tutorial - ploidy: 2 - busco_lineage: eudicots_odb10 - mode: default -``` -## 2. Addapt the scripts to your HPC -```bash -vim .config/snakemake_profile/slurm/cluster_config.yml -``` -The current profile is configured for SLURM. If you use SLURM, change line 13 to your email address. - -To run this workflow on another HPC, create a new profile (https://github.com/Snakemake-Profiles) and add it to the .config/snakemake_profile directory. Update the CLUSTER_CONFIG and PROFILE variables in the job.sh and prejob.sh scripts. - -If your cluster doesn’t have Singularity enabled by default, add it to the list of modules to load in job.sh. - -## 3. Dry run -To check the configuration, first perform a dry run of the workflow: -```bash -sbatch job.sh dry -``` -You can consult the logs in the slurm_logs/ directory. -## 4. 
Run -If the dry run is successful, ensure that the SNG_BIND variable in job.sh matches the root variable in masterconfig.yaml. -Then, run the script: -```bash -sbatch job.sh -``` -## Other assembly modes -If you want to use additional Hi-C data or parental data, follow the [Hi-C assembly mode tutorial](Assembly-Mode/Hi-C-tutorial.md) or the [Trio assembly mode tutorial](Assembly-Mode/Trio-tutorial.md). To go further with the workflow use go [here](Going-further.md). \ No newline at end of file diff --git a/doc/Tar-data-preparation.md b/doc/Tar-data-preparation.md deleted file mode 100644 index cc75afd..0000000 --- a/doc/Tar-data-preparation.md +++ /dev/null @@ -1,34 +0,0 @@ -# Optional: data preparation - -If your data is in a tarball, this companion workflow will extract the data and convert bam files to fastq and fasta if necessary. - -[TOC] - -## 1. Config file -```bash -cd GenomAsm4pg/.config -``` -Modify the the `data` variable in file `.config/masterconfig.yaml` to be the path to the directory containing all input tar files. -This workflow can automatically determine the name of files in the specified `data` directory, or run only on given files : -- `get_all_tar_filename: True` will uncompress all tar files. If you want to choose the the files to uncompress, use `get_all_tar_filename: False` and give the filenames as a list in `tarIDS` - -## 2. Run -Modify the `SNG_BIND` variable in `prejob.sh`, it has to be the same as the variable `root` in `.config/masterconfig.yaml`. Change line 17 to your email adress. -If Singularity is not in the HPC environement, add `module load singularity` under Module loading. - -Then run - -```bash -sbatch prejob.sh -``` - -## 3. Outputs -This will create multiple directories to prepare the data for the workflow. You will end up with a `bam_files` directory containing all *bam* files, renamed as the tar filename if your data was named "ccs.bam", and a `fastx_files` directory containing all *fasta* and *fastq* files. The `extract` directory contains all other files that were in the tar ball. 
- -``` -workflow_results -└── 00_raw_data - ├── bam_files - ├── extract - └── fastx_files -``` diff --git a/doc/dag.svg b/doc/dag.svg new file mode 100644 index 0000000..e176c29 --- /dev/null +++ b/doc/dag.svg @@ -0,0 +1,215 @@ +<?xml version="1.0" standalone="no"?> +<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="676pt" height="404pt" viewBox="0.00 0.00 676.44 404.00"> +<g id="graph0" class="graph" transform="translate(4,400) scale(1)" data-name="snakemake_dag"> + +<polygon fill="white" stroke="none" points="-4,4 -4,-400 672.44,-400 672.44,4 -4,4" style=""/> +<!-- 0 --> +<g id="node1" class="node" pointer-events="visible" data-name="0"> + +<path fill="none" stroke="#d88d56" stroke-width="2" d="M350.51,-36C350.51,-36 320.51,-36 320.51,-36 314.51,-36 308.51,-30 308.51,-24 308.51,-24 308.51,-12 308.51,-12 308.51,-6 314.51,0 320.51,0 320.51,0 350.51,0 350.51,0 356.51,0 362.51,-6 362.51,-12 362.51,-12 362.51,-24 362.51,-24 362.51,-30 356.51,-36 350.51,-36" style=""/> +<text text-anchor="middle" x="335.51" y="-15" font-family="sans" font-size="10.00" style="">all</text> +</g> +<!-- 1 --> +<g id="node2" class="node" pointer-events="visible" data-name="1"> + +<path fill="none" stroke="#b6d856" stroke-width="2" d="M207.67,-252C207.67,-252 139.35,-252 139.35,-252 133.35,-252 127.35,-246 127.35,-240 127.35,-240 127.35,-228 127.35,-228 127.35,-222 133.35,-216 139.35,-216 139.35,-216 207.67,-216 207.67,-216 213.67,-216 219.67,-222 219.67,-228 219.67,-228 219.67,-240 219.67,-240 219.67,-246 213.67,-252 207.67,-252" style=""/> +<text text-anchor="middle" x="173.51" y="-237" font-family="sans" font-size="10.00" style="">haplotigs_handling</text> +<text text-anchor="middle" x="173.51" y="-225" font-family="sans" font-size="10.00" style="">n: 1</text> +</g> +<!-- 1->0 --> +<g id="edge1" class="edge" data-name="1->0"> + +<path fill="none" stroke="grey" stroke-width="2" d="M131.29,-215.11C69.2,-186.36 -34.57,-127.44 12.51,-72 48.33,-29.83 216.55,-21.15 295.02,-19.41" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="294.73,-22.91 304.67,-19.23 294.6,-15.91 294.73,-22.91" style=""/> +</g> +<!-- 5 --> +<g id="node6" class="node" pointer-events="visible" data-name="5"> + +<path fill="none" stroke="#61d856" stroke-width="2" d="M81.05,-108C81.05,-108 33.97,-108 33.97,-108 27.97,-108 21.97,-102 21.97,-96 21.97,-96 21.97,-84 21.97,-84 21.97,-78 27.97,-72 33.97,-72 33.97,-72 81.05,-72 81.05,-72 87.05,-72 93.05,-78 93.05,-84 93.05,-84 93.05,-96 93.05,-96 93.05,-102 87.05,-108 81.05,-108" style=""/> +<text text-anchor="middle" x="57.51" y="-87" font-family="sans" font-size="10.00" style="">cutoffs_graph</text> +</g> +<!-- 1->5 --> +<g id="edge13" class="edge" data-name="1->5"> + +<path fill="none" stroke="grey" stroke-width="2" d="M158.82,-215.02C138.94,-190.69 103.31,-147.07 80.04,-118.58" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="83.03,-116.71 74,-111.18 77.61,-121.14 83.03,-116.71" style=""/> +</g> +<!-- 7 --> +<g id="node8" class="node" pointer-events="visible" data-name="7"> + +<path fill="none" stroke="#56c1d8" stroke-width="2" d="M223.83,-108C223.83,-108 123.19,-108 123.19,-108 117.19,-108 111.19,-102 111.19,-96 111.19,-96 111.19,-84 111.19,-84 111.19,-78 117.19,-72 123.19,-72 123.19,-72 223.83,-72 223.83,-72 229.83,-72 235.83,-78 235.83,-84 235.83,-84 235.83,-96 235.83,-96 235.83,-102 229.83,-108 223.83,-108" style=""/> +<text text-anchor="middle" x="173.51" y="-87" font-family="sans" font-size="10.00" 
style="">genometools_on_assembly</text> +</g> +<!-- 1->7 --> +<g id="edge15" class="edge" data-name="1->7"> + +<path fill="none" stroke="grey" stroke-width="2" d="M173.51,-215.02C173.51,-191.54 173.51,-150.11 173.51,-121.64" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="177.01,-121.87 173.51,-111.87 170.01,-121.87 177.01,-121.87" style=""/> +</g> +<!-- 10 --> +<g id="node11" class="node" pointer-events="visible" data-name="10"> + +<path fill="none" stroke="#ced856" stroke-width="2" d="M282.38,-180C282.38,-180 226.64,-180 226.64,-180 220.64,-180 214.64,-174 214.64,-168 214.64,-168 214.64,-156 214.64,-156 214.64,-150 220.64,-144 226.64,-144 226.64,-144 282.38,-144 282.38,-144 288.38,-144 294.38,-150 294.38,-156 294.38,-156 294.38,-168 294.38,-168 294.38,-174 288.38,-180 282.38,-180" style=""/> +<text text-anchor="middle" x="254.51" y="-159" font-family="sans" font-size="10.00" style="">unpigz_to_fasta</text> +</g> +<!-- 1->10 --> +<g id="edge18" class="edge" data-name="1->10"> + +<path fill="none" stroke="grey" stroke-width="2" d="M193.95,-215.34C203.09,-207.44 214.05,-197.96 224.12,-189.26" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="226.23,-192.07 231.5,-182.88 221.65,-186.77 226.23,-192.07" style=""/> +</g> +<!-- 2 --> +<g id="node3" class="node" pointer-events="visible" data-name="2"> + +<path fill="none" stroke="#56d8d0" stroke-width="2" d="M305.15,-324C305.15,-324 241.87,-324 241.87,-324 235.87,-324 229.87,-318 229.87,-312 229.87,-312 229.87,-300 229.87,-300 229.87,-294 235.87,-288 241.87,-288 241.87,-288 305.15,-288 305.15,-288 311.15,-288 317.15,-294 317.15,-300 317.15,-300 317.15,-312 317.15,-312 317.15,-318 311.15,-324 305.15,-324" style=""/> +<text text-anchor="middle" x="273.51" y="-303" font-family="sans" font-size="10.00" style="">pigz_gfa_to_fasta</text> +</g> +<!-- 2->1 --> +<g id="edge10" class="edge" data-name="2->1"> + +<path fill="none" stroke="grey" stroke-width="2" d="M248.02,-287.15C236.26,-278.93 222.11,-269.02 209.31,-260.06" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="211.63,-257.42 201.43,-254.55 207.62,-263.15 211.63,-257.42" style=""/> +</g> +<!-- 4 --> +<g id="node5" class="node" pointer-events="visible" data-name="4"> + +<path fill="none" stroke="#b6d856" stroke-width="2" d="M426.67,-252C426.67,-252 358.35,-252 358.35,-252 352.35,-252 346.35,-246 346.35,-240 346.35,-240 346.35,-228 346.35,-228 346.35,-222 352.35,-216 358.35,-216 358.35,-216 426.67,-216 426.67,-216 432.67,-216 438.67,-222 438.67,-228 438.67,-228 438.67,-240 438.67,-240 438.67,-246 432.67,-252 426.67,-252" style=""/> +<text text-anchor="middle" x="392.51" y="-237" font-family="sans" font-size="10.00" style="">haplotigs_handling</text> +<text text-anchor="middle" x="392.51" y="-225" font-family="sans" font-size="10.00" style="">n: 2</text> +</g> +<!-- 2->4 --> +<g id="edge12" class="edge" data-name="2->4"> + +<path fill="none" stroke="grey" stroke-width="2" d="M303.84,-287.15C318.25,-278.68 335.69,-268.42 351.28,-259.25" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="352.77,-262.43 359.62,-254.35 349.22,-256.4 352.77,-262.43" style=""/> +</g> +<!-- 3 --> +<g id="node4" class="node" pointer-events="visible" data-name="3"> + +<path fill="none" stroke="#d8c356" stroke-width="2" d="M295.44,-396C295.44,-396 251.57,-396 251.57,-396 245.57,-396 239.57,-390 239.57,-384 239.57,-384 239.57,-372 239.57,-372 239.57,-366 245.57,-360 251.57,-360 251.57,-360 295.44,-360 295.44,-360 301.44,-360 307.44,-366 
307.44,-372 307.44,-372 307.44,-384 307.44,-384 307.44,-390 301.44,-396 295.44,-396" style=""/> +<text text-anchor="middle" x="273.51" y="-381" font-family="sans" font-size="10.00" style="">hifiasm</text> +<text text-anchor="middle" x="273.51" y="-369" font-family="sans" font-size="10.00" style="">sample: run1</text> +</g> +<!-- 3->2 --> +<g id="edge11" class="edge" data-name="3->2"> + +<path fill="none" stroke="grey" stroke-width="2" d="M273.51,-359.34C273.51,-352.75 273.51,-345.08 273.51,-337.67" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="277.01,-337.93 273.51,-327.93 270.01,-337.93 277.01,-337.93" style=""/> +</g> +<!-- 4->0 --> +<g id="edge2" class="edge" data-name="4->0"> + +<path fill="none" stroke="grey" stroke-width="2" d="M372.55,-215.37C363.24,-205.88 353,-193.4 347.51,-180 330.01,-137.34 330.05,-83.03 332.31,-49.65" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="335.79,-49.99 333.12,-39.74 328.81,-49.42 335.79,-49.99" style=""/> +</g> +<!-- 6 --> +<g id="node7" class="node" pointer-events="visible" data-name="6"> + +<path fill="none" stroke="#61d856" stroke-width="2" d="M416.05,-180C416.05,-180 368.97,-180 368.97,-180 362.97,-180 356.97,-174 356.97,-168 356.97,-168 356.97,-156 356.97,-156 356.97,-150 362.97,-144 368.97,-144 368.97,-144 416.05,-144 416.05,-144 422.05,-144 428.05,-150 428.05,-156 428.05,-156 428.05,-168 428.05,-168 428.05,-174 422.05,-180 416.05,-180" style=""/> +<text text-anchor="middle" x="392.51" y="-159" font-family="sans" font-size="10.00" style="">cutoffs_graph</text> +</g> +<!-- 4->6 --> +<g id="edge14" class="edge" data-name="4->6"> + +<path fill="none" stroke="grey" stroke-width="2" d="M392.51,-215.34C392.51,-208.75 392.51,-201.08 392.51,-193.67" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="396.01,-193.93 392.51,-183.93 389.01,-193.93 396.01,-193.93" style=""/> +</g> +<!-- 8 --> +<g id="node9" class="node" pointer-events="visible" data-name="8"> + +<path fill="none" stroke="#56c1d8" stroke-width="2" d="M558.83,-180C558.83,-180 458.19,-180 458.19,-180 452.19,-180 446.19,-174 446.19,-168 446.19,-168 446.19,-156 446.19,-156 446.19,-150 452.19,-144 458.19,-144 458.19,-144 558.83,-144 558.83,-144 564.83,-144 570.83,-150 570.83,-156 570.83,-156 570.83,-168 570.83,-168 570.83,-174 564.83,-180 558.83,-180" style=""/> +<text text-anchor="middle" x="508.51" y="-159" font-family="sans" font-size="10.00" style="">genometools_on_assembly</text> +</g> +<!-- 4->8 --> +<g id="edge16" class="edge" data-name="4->8"> + +<path fill="none" stroke="grey" stroke-width="2" d="M422.08,-215.15C435.99,-206.76 452.79,-196.62 467.87,-187.52" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="469.64,-190.54 476.4,-182.38 466.03,-184.55 469.64,-190.54" style=""/> +</g> +<!-- 12 --> +<g id="node13" class="node" pointer-events="visible" data-name="12"> + +<path fill="none" stroke="#ced856" stroke-width="2" d="M656.38,-180C656.38,-180 600.64,-180 600.64,-180 594.64,-180 588.64,-174 588.64,-168 588.64,-168 588.64,-156 588.64,-156 588.64,-150 594.64,-144 600.64,-144 600.64,-144 656.38,-144 656.38,-144 662.38,-144 668.38,-150 668.38,-156 668.38,-156 668.38,-168 668.38,-168 668.38,-174 662.38,-180 656.38,-180" style=""/> +<text text-anchor="middle" x="628.51" y="-159" font-family="sans" font-size="10.00" style="">unpigz_to_fasta</text> +</g> +<!-- 4->12 --> +<g id="edge20" class="edge" data-name="4->12"> + +<path fill="none" stroke="grey" stroke-width="2" d="M439.61,-220.46C474.74,-210.93 
524.18,-197.14 575.33,-181.23" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="576.28,-184.61 584.77,-178.27 574.18,-177.93 576.28,-184.61" style=""/> +</g> +<!-- 5->0 --> +<g id="edge3" class="edge" data-name="5->0"> + +<path fill="none" stroke="grey" stroke-width="2" d="M93.71,-74.83C96.67,-73.83 99.63,-72.87 102.51,-72 168.99,-51.98 248.14,-35.49 294.97,-26.46" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="295.43,-29.94 304.6,-24.63 294.12,-23.06 295.43,-29.94" style=""/> +</g> +<!-- 6->0 --> +<g id="edge4" class="edge" data-name="6->0"> + +<path fill="none" stroke="grey" stroke-width="2" d="M385.29,-143.02C375.78,-119.33 358.93,-77.35 347.5,-48.86" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="350.77,-47.62 343.8,-39.65 344.27,-50.23 350.77,-47.62" style=""/> +</g> +<!-- 7->0 --> +<g id="edge5" class="edge" data-name="7->0"> + +<path fill="none" stroke="grey" stroke-width="2" d="M214.81,-71.15C239.67,-60.41 271.16,-46.81 295.74,-36.18" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="296.97,-39.47 304.76,-32.29 294.19,-33.04 296.97,-39.47" style=""/> +</g> +<!-- 8->0 --> +<g id="edge6" class="edge" data-name="8->0"> + +<path fill="none" stroke="grey" stroke-width="2" d="M479.96,-143.14C464.85,-133.31 446.23,-120.58 430.51,-108 406.16,-88.52 380.47,-64.19 361.95,-45.89" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="364.66,-43.65 355.11,-39.07 359.72,-48.61 364.66,-43.65" style=""/> +</g> +<!-- 9 --> +<g id="node10" class="node" pointer-events="visible" data-name="9"> + +<path fill="none" stroke="#56d8b9" stroke-width="2" d="M295.51,-108C295.51,-108 265.51,-108 265.51,-108 259.51,-108 253.51,-102 253.51,-96 253.51,-96 253.51,-84 253.51,-84 253.51,-78 259.51,-72 265.51,-72 265.51,-72 295.51,-72 295.51,-72 301.51,-72 307.51,-78 307.51,-84 307.51,-84 307.51,-96 307.51,-96 307.51,-102 301.51,-108 295.51,-108" style=""/> +<text text-anchor="middle" x="280.51" y="-87" font-family="sans" font-size="10.00" style="">busco</text> +</g> +<!-- 9->0 --> +<g id="edge7" class="edge" data-name="9->0"> + +<path fill="none" stroke="grey" stroke-width="2" d="M294.39,-71.34C300.2,-63.93 307.11,-55.14 313.59,-46.9" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="316.15,-49.31 319.57,-39.28 310.64,-44.98 316.15,-49.31" style=""/> +</g> +<!-- 10->9 --> +<g id="edge17" class="edge" data-name="10->9"> + +<path fill="none" stroke="grey" stroke-width="2" d="M261.07,-143.34C263.57,-136.59 266.51,-128.69 269.32,-121.11" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="272.6,-122.33 272.8,-111.74 266.04,-119.9 272.6,-122.33" style=""/> +</g> +<!-- 11 --> +<g id="node12" class="node" pointer-events="visible" data-name="11"> + +<path fill="none" stroke="#56d8b9" stroke-width="2" d="M481.51,-108C481.51,-108 451.51,-108 451.51,-108 445.51,-108 439.51,-102 439.51,-96 439.51,-96 439.51,-84 439.51,-84 439.51,-78 445.51,-72 451.51,-72 451.51,-72 481.51,-72 481.51,-72 487.51,-72 493.51,-78 493.51,-84 493.51,-84 493.51,-96 493.51,-96 493.51,-102 487.51,-108 481.51,-108" style=""/> +<text text-anchor="middle" x="466.51" y="-87" font-family="sans" font-size="10.00" style="">busco</text> +</g> +<!-- 11->0 --> +<g id="edge8" class="edge" data-name="11->0"> + +<path fill="none" stroke="grey" stroke-width="2" d="M438.73,-74.15C420.06,-64.18 395.18,-50.89 374.53,-39.85" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="376.39,-36.88 
365.92,-35.25 373.09,-43.05 376.39,-36.88" style=""/> +</g> +<!-- 12->11 --> +<g id="edge19" class="edge" data-name="12->11"> + +<path fill="none" stroke="grey" stroke-width="2" d="M587.74,-144.37C565.69,-135.26 537.74,-123.54 506.16,-109.55" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="507.69,-106.4 497.13,-105.53 504.84,-112.8 507.69,-106.4" style=""/> +</g> +<!-- 13 --> +<g id="node14" class="node" pointer-events="visible" data-name="13"> + +<path fill="none" stroke="#d8ac56" stroke-width="2" d="M623.26,-108C623.26,-108 523.76,-108 523.76,-108 517.76,-108 511.76,-102 511.76,-96 511.76,-96 511.76,-84 511.76,-84 511.76,-78 517.76,-72 523.76,-72 523.76,-72 623.26,-72 623.26,-72 629.26,-72 635.26,-78 635.26,-84 635.26,-84 635.26,-96 635.26,-96 635.26,-102 629.26,-108 623.26,-108" style=""/> +<text text-anchor="middle" x="573.51" y="-93" font-family="sans" font-size="10.00" style="">genometools_on_raw_data</text> +<text text-anchor="middle" x="573.51" y="-81" font-family="sans" font-size="10.00" style="">sample: run1</text> +</g> +<!-- 13->0 --> +<g id="edge9" class="edge" data-name="13->0"> + +<path fill="none" stroke="grey" stroke-width="2" d="M512.53,-71.06C469.53,-58.42 413.15,-41.84 375.7,-30.82" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="376.78,-27.49 366.2,-28.03 374.8,-34.21 376.78,-27.49" style=""/> +</g> +</g> +</svg> \ No newline at end of file -- GitLab From 1455d7fc86fe3a0702689228210c2f18c2c08c86 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 9 Jan 2025 15:51:58 +0100 Subject: [PATCH 101/178] Update variables to match the rest of the wf --- workflow/scripts/report.Rmd | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/workflow/scripts/report.Rmd b/workflow/scripts/report.Rmd index b26f047..78e48c3 100644 --- a/workflow/scripts/report.Rmd +++ b/workflow/scripts/report.Rmd @@ -38,7 +38,7 @@ output: ### Reads statistics ```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["gt_reads"]]), sep = '\n') +cat(readLines(snakemake@input[["genometools_on_raw_data"]]), sep = '\n') ``` @@ -46,39 +46,39 @@ cat(readLines(snakemake@input[["gt_reads"]]), sep = '\n') ### Assembly statistics #### Hap 1 ```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["gt_asm_1"]]), sep = '\n') +cat(readLines(snakemake@input[["genometools_hap1"]]), sep = '\n') ``` #### Hap 2 ```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["gt_asm_2"]]), sep = '\n') +cat(readLines(snakemake@input[["genometools_hap2"]]), sep = '\n') ``` ### K-mer profiles | Hap 1 | Hap 2 | |-------|-------| -|  |  | +|  |  | ### K-mer completeness and error rate Completeness ```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["merq_comp"]]), sep = '\n') +cat(readLines(snakemake@input[["merqury_stats"]]), sep = '\n') ``` Error rate ```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["merq_err"]]), sep = '\n') +cat(readLines(snakemake@input[["merqury_qv"]]), sep = '\n') ``` ### BUSCO #### Hap 1 ```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["busco_1"]]), sep = '\n') +cat(readLines(snakemake@input[["busco_hap1"]]), sep = '\n') ``` #### Hap 2 ```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["busco_2"]]), sep = '\n') +cat(readLines(snakemake@input[["busco_hap2"]]), sep = '\n') ``` ### Telomeres @@ -86,29 +86,29 @@ Telomeres present in assembly #### Hap 1 ```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["tel_1"]]), sep = '\n') 
+cat(readLines(snakemake@input[["telomeres_hap1"]]), sep = '\n')
 ```
 #### Hap 2
 ```{r comment='', echo=FALSE}
-cat(readLines(snakemake@input[["tel_2"]]), sep = '\n')
+cat(readLines(snakemake@input[["telomeres_hap2"]]), sep = '\n')
 ```
 ### LTR Assembly Index (LAI)
 #### Hap 1
 LTR recap
 ```{r comment='', echo=FALSE}
-cat(head(readLines(snakemake@input[["LRT_recap_1"]]), 50), sep = '\n')
+cat(head(readLines(snakemake@input[["LRT_recap_hap1"]]), 50), sep = '\n')
 ```
 LAI
 ```{r comment='', echo=FALSE}
-cat(head(readLines(snakemake@input[["LAI_1"]]), 2), sep = '\n')
+cat(head(readLines(snakemake@input[["LAI_hap1"]]), 2), sep = '\n')
 ```
 #### Hap 2
 LTR recap
 ```{r comment='', echo=FALSE}
-cat(head(readLines(snakemake@input[["LRT_recap_2"]]), 50), sep = '\n')
+cat(head(readLines(snakemake@input[["LRT_recap_hap2"]]), 50), sep = '\n')
 ```
 LAI
 ```{r comment='', echo=FALSE}
-cat(head(readLines(snakemake@input[["LAI_2"]]), 2), sep = '\n')
+cat(head(readLines(snakemake@input[["LAI_hap2"]]), 2), sep = '\n')
 ```
\ No newline at end of file
-- GitLab

From 39ff1259855adecb479246d6218fb9d0562a726a Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Thu, 9 Jan 2025 15:52:08 +0100
Subject: [PATCH 102/178] ignore results

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index edcfc05..0926921 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,4 +20,5 @@ node_modules/*
 !workflow/*
 .snakemake
 busco_downloads
-.cache
\ No newline at end of file
+.cache
+results
\ No newline at end of file
-- GitLab

From ed931a34d5e0812bda0e63e2594cc36d499b04ef Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Thu, 9 Jan 2025 15:52:26 +0100
Subject: [PATCH 103/178] Update quick install guide

---
 README.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 1d3792b..4c6908d 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ This workflow uses [Snakemake](https://snakemake.readthedocs.io/en/stable/) to q
 ```
 ## Requirement
-Miniforge, Singularity/Apptainer
+Miniforge, Singularity/Apptainer, Snakemake
 ## How to Use
 ### 1. Set up
 Clone the Git repository
 ```
 git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git && cd GenomAsm4pg
 ```
-Install Miniforge and create a virtual environement :
+### 2. Configure the pipeline
+- Edit the `masterconfig` file in the `.config/` directory with your sample information.
+
+### 3. Run the workflow
+#### A. On an HPC
+Create a virtual environment for the workflow to run into:
 ```bash
 conda create -n wf_env -c conda-forge -c bioconda snakemake=8.4.7 snakemake-executor-plugin-slurm
 ```
 > Use Miniforge with the conda-forge channel, see why [here](https://science-ouverte.inrae.fr/fr/offre-service/fiches-pratiques-et-recommandations/quelles-alternatives-aux-fonctionnalites-payantes-danaconda) (french)
-### 2. Configure the pipeline
-- Edit the `masterconfig` file in the `.config/` directory with your sample information.
-
-### 3. Run the workflow
-#### On a HPC
 - Edit `job.sh` with your email and add path to the needed modules (`Singularity/Apptainer`, `Miniforge`)
-- Provide the needed conda environement in `job.sh`, under `source activate wf_env`
+- Provide the environment you created in `job.sh`, under `source activate wf_env`
 - Run the workflow :
 ```bash
 sbatch job.sh dry # Check for warnings
 sbatch job.sh run # Then
 ```
 > **Nb 1:** If your account name can't be automatically determined, add it in the `.config/snakemake/profiles/slurm/config.yaml` file.
-#### Localy
-- Activate the environement `source activate wf_env`
+#### B. Locally
+- Make sure you have Snakemake and Singularity/Apptainer installed
 - Run the workflow :
 ```bash
 ./local_run dry # Check for warnings
-- GitLab

From c71ded2acc6589a2662c9b7f69b745b400a5c1b0 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Thu, 9 Jan 2025 15:52:49 +0100
Subject: [PATCH 104/178] Fix haplotype number typo

---
 workflow/Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 2b93a46..7641a16 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -369,8 +369,8 @@ rule generate_report:
         genometools_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "{sample}_hap1_genometools_stats.txt"),
         genometools_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "{sample}_hap2_genometools_stats.txt"),
-        busco_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "short_summary.specific.{sample}_hap1.txt"),
-        busco_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "short_summary.specific.{sample}_hap2.txt"),
+        busco_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "short_summary.specific.{sample}_hap1.txt"),
+        busco_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "short_summary.specific.{sample}_hap2.txt"),
         kplot_hap1 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "katplot", "{sample}_hap1.katplot.png"),
         kplot_hap2 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "katplot", "{sample}_hap2.katplot.png"),
-- GitLab

From 2995b7efb2b7435c7ad9c43e7710b92542dca970 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Thu, 9 Jan 2025 16:49:57 +0100
Subject: [PATCH 105/178] Add ragtag draft

---
 workflow/Snakefile              | 52 ++++++++++++++++++++++-----------
 workflow/scripts/ragtag_call.sh | 30 +++++++++++++++++++
 2 files changed, 65 insertions(+), 17 deletions(-)
 create mode 100644 workflow/scripts/ragtag_call.sh

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 7641a16..39f8097 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -95,7 +95,7 @@ rule haplotigs_handling:
         cutoffs = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","cutoffs")
     params:
         prefix = "{sample}_hap{n}",
-        dirr = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}"),
+        out_dir = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}"),
         purge_dups_option = get_purge_bool
     threads: 20
     resources:
@@ -105,7 +105,7 @@
         f"{container_registry}/purge_dups1.2.5"
     shell:
         """
-        ./workflow/scripts/haplotigs_handling.sh {params.purge_dups_option} {input.hap_fasta} {output.hap} {params.prefix} {input.reads} {params.dirr}
+        ./workflow/scripts/haplotigs_handling.sh {params.purge_dups_option} {input.hap_fasta} {output.hap} {params.prefix} {input.reads} {params.out_dir}
         """
 # Rule to unzip the haplotypes if needed for other rules
@@ -131,7 +131,7 @@ rule cutoffs_graph:
     output:
         graph = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","cutoffs_graph_hap{n}.png")
     params:
-        dirr = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}")
+        out_dir = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}")
     threads: 1
     resources:
         mem_mb=10000,
@@ -139,7 +139,7 @@
     container:
         f"{container_registry}/matplotlib0.11.5"
     shell:
-        "python3 workflow/scripts/hist_plot.py -c {input} {params.dirr}/PB.stat {output.graph}"
+        "python3 workflow/scripts/hist_plot.py -c {input} {params.out_dir}/PB.stat {output.graph}"
 # Produce basic stats for the reads files
 rule genometools_on_raw_data:
@@ -234,14 +234,14 @@ rule genomescope:
     params:
         ploidy = get_ploidy,
         km_size = get_kmer_size,
-        dirr = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope"),
+        out_dir = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope"),
     resources:
         mem_mb=40000,
         time="10:00:00"
     container:
         f"{container_registry}/genomescope2.0"
     shell:
-        "genomescope.R -k {params.km_size} -i {input} -o {params.dirr} -p {params.ploidy}"
+        "genomescope.R -k {params.km_size} -i {input} -o {params.out_dir} -p {params.ploidy}"
 # NOT TESTED
 # Compare the k-mer content between the assembly and the raw reads.
 rule kat:
     input:
         hap = rules.haplotigs_handling.output.hap,
         jellyfish = rules.jellyfish.output.histo
     output:
         os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}.katplot.png")
     params:
-        dirr = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}"),
+        out_dir = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}"),
         km_size = get_kmer_size
     threads: 4
     resources:
@@ -262,8 +262,8 @@
         f"{container_registry}/kat2.4.1"
     shell:
         """
-        kat comp -o {params.dirr} -t {threads} -m {params.km_size}--output_type png -v {input.jellyfish} {input.hap} &&
-        kat plot spectra-cn -x 200 -o {params.dirr}.katplot.png {params.path}-main.mx
+        kat comp -o {params.out_dir} -t {threads} -m {params.km_size}--output_type png -v {input.jellyfish} {input.hap} &&
+        kat plot spectra-cn -x 200 -o {params.out_dir}.katplot.png {params.path}-main.mx
         """
@@ -296,7 +296,7 @@ rule merqury:
         stats = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.completeness.stats")
     params:
         prefix = "{sample}_merqury",
-        dirr = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury")
+        out_dir = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury")
     threads: 20
     resources:
         mem_mb=60000,
@@ -306,13 +306,13 @@
     shell:
         """
         # Create temporary copies of haplotypes
-        cp {input.hap1} {params.dirr}/tmp_hap1.fasta.gz
-        cp {input.hap2} {params.dirr}/tmp_hap2.fasta.gz
+        cp {input.hap1} {params.out_dir}/tmp_hap1.fasta.gz
+        cp {input.hap2} {params.out_dir}/tmp_hap2.fasta.gz
         # Run Merqury
-        cd {params.dirr} && \
+        cd {params.out_dir} && \
         export MERQURY=/usr/local/share/merqury && \
-        merqury.sh {input.read_db} {params.dirr}/tmp_hap1.fasta.gz {params.dirr}/tmp_hap1.fasta.gz {params.prefix}
+        merqury.sh {input.read_db} {params.out_dir}/tmp_hap1.fasta.gz {params.out_dir}/tmp_hap1.fasta.gz {params.prefix}
         """
 # NOT TESTED
@@ -341,7 +341,7 @@ rule LTR_retriever:
         recap = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "recap_{sample}_hap{n}.tbl")
     params:
         prefix="{sample}_hap{n}",
-        dirr = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR")
+        out_dir = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR")
     threads: 20
     resources:
         mem_mb=250000,
@@ -351,7 +351,7 @@
     shell:
         '''
        export PATH="/opt/LTR_retriever:$PATH" &&
-        cd {params.dirr} &&
+        cd {params.out_dir} &&
         LTR_retriever -threads {threads} -genome {input.hap} -infinder {input.scn} &&
         mv {params.prefix}.fa.out.LAI {output.lai} &&
         mv {params.prefix}.fa.tbl {output.recap} &&
@@ -413,4 +413,22 @@
     output:
         os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html"),
     shell:
-        "mv {input} {output}"
\ No newline at end of file
+        "mv {input} {output}"
+
+rule ragtag:
+    input:
+        rules.unpigz_to_fasta.output
+    output:
+        dir(os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","ragtag_scafold"))
+    params:
+        out_dir = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","ragtag_scafold")
+    threads: 4
+    resources:
+        mem_mb = 80000,
+        time = "10:00:00"
+    container:
+        f"{container_registry}/ragtag:2.0.1"
+    shell:
+        """
+        ./workflow/scripts/ragtag_call.sh
+        """
diff --git a/workflow/scripts/ragtag_call.sh b/workflow/scripts/ragtag_call.sh
new file mode 100644
index 0000000..accb3b6
--- /dev/null
+++ b/workflow/scripts/ragtag_call.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Script to dynamically run ragtag or not
+# Author: Lucien PIAT
+# For: Project Pangenoak
+# Date: January 6, 2025
+
+RAGTAG=$1
+HAP_IN=$2
+HAP_OUT=$3
+REF=$4
+DIRR=$6
+
+
+if [[ "$RAGTAG" == "True" || "$RAGTAG" == "true" ]]; then
+
+    echo "Asm4pg -> Running ragtag"
+    mkdir -p $DIRR
+    ragtag.py scaffold -o {params.out_dir} -t {threads} {input.reference} {input.assembly} &&
+
+else
+    echo "Asm4pg -> Ragtag option is off"
+    mkdir -p $DIRR
+
+fi
+
+
+
+
+    mv {params.out_dir}/ragtag.scaffold.fasta {output.scaffold} &&
+    mv {params.out_dir}/ragtag.scaffold.paf {output.alignment}
\ No newline at end of file
-- GitLab

From 393c35dc03220c135eefee5d280a7cd9448948ab Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 10 Jan 2025 09:21:19 +0100
Subject: [PATCH 106/178] add functions to fetch reference genome

---
 workflow/scripts/parameter_retrieval.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/workflow/scripts/parameter_retrieval.py b/workflow/scripts/parameter_retrieval.py
index a3963b0..eb37b05 100644
--- a/workflow/scripts/parameter_retrieval.py
+++ b/workflow/scripts/parameter_retrieval.py
@@ -2,7 +2,7 @@ from snakemake.io import expand
 # Used to retrieve the parameters for rules
 # Fetch the purge level for hifiasm
-def get_purge_force(wildcards):
+def get_purge_force(wildcards) -> str:
     try :
         force = config["samples"][wildcards.sample]["assembly_purge_force"]
     except KeyError:
@@ -11,7 +11,7 @@
     return force
 # Fetch the mode for hifiasm
-def get_mode(wildcards):
+def get_mode(wildcards) -> str:
     try :
         mode = config["samples"][wildcards.sample]["mode"]
     except KeyError:
@@ -20,7 +20,7 @@
     return mode
 # Fetch r1/r2 fasta file for hi-c
-def get_run(wildcards, run:int):
+def get_run(wildcards, run:int) -> str:
     try :
         run= config["samples"][wildcards.sample][f"r{run}"]
     except KeyError:
     return run
 # Fetch the purge mode, return a boolean from config file
-def get_purge_bool(wildcards):
+def get_purge_bool(wildcards) -> bool:
     try :
         purge_bool = config["samples"][wildcards.sample]["run_purge_dups"]
     except KeyError:
@@ -58,4 +58,19 @@ def get_kmer_size(wildcards) -> int:
     except KeyError:
         print('Asm4pg -> "kmer_size" unspecified for ' + wildcards.sample + ', using 21 by default')
         return 21
-    return size
\ No newline at end of file
+    return size
+
+def get_reference(wildcards) -> str:
+    try :
+        reference_genome = config["samples"][wildcards.sample]["reference_genome"]
+    except KeyError:
+        return 'None'
+    return reference_genome
+
+def get_ragtag_bool(wildcards) -> bool:
+    try :
+        ragtag_bool = config["samples"][wildcards.sample]["run_ragtag"]
+    except KeyError:
+        print('Asm4pg -> "run_ragtag" unspecified for ' + wildcards.sample + ', using "False" by default')
+        return False
+    return ragtag_bool
\ No newline at end of file
-- GitLab
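The two getters added in this patch read optional per-sample keys from the workflow's master config. A minimal sketch of a sample entry that would exercise them — the sample name and file paths are illustrative, and only the `reference_genome` and `run_ragtag` keys (plus the `samples:` layout implied by `config["samples"]` and the `fasta_gz` key used elsewhere in the Snakefile) are taken from the patches:

```yaml
samples:
  toy_dataset:
    fasta_gz: "tutorial_data/toy_dataset.fasta.gz"     # illustrative input path
    run_ragtag: True            # read by get_ragtag_bool(); defaults to False when absent
    reference_genome: "tutorial_data/reference.fasta"  # read by get_reference(); 'None' when absent
```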
"{sample}_hap{n}.out.LAI"), + sample=config["samples"].keys(), n=[1, 2] + ), + expand( + os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "recap_{sample}_hap{n}.tbl"), + sample=config["samples"].keys(), n=[1, 2] ) - + # Genome assembly using hifiasm rule hifiasm: input: @@ -157,7 +181,6 @@ rule genometools_on_raw_data: shell: "gt seqstat {input} > {output}" -# NOT TESTED # Produce basic stats for each haplotypes assembled use rule genometools_on_raw_data as genometools_on_assembly with: input: @@ -186,7 +209,6 @@ rule busco: shell: "busco -f -i {input} -l {params.lineage} --out_path {params.prefix} -o {params.sample} -m genome -c {threads}" -# NOT TESTED # Estimate telomeric region content rule find_telomeres: input: @@ -202,7 +224,6 @@ rule find_telomeres: shell: "python3 workflow/scripts/FindTelomeres.py {input} > {output}" -# NOT TESTED # Count k-mers in the input reads and generate a histogram of k-mer frequencies. rule jellyfish: input: @@ -220,11 +241,12 @@ rule jellyfish: f"{container_registry}/jellyfish2.3.0" shell: """ + echo "Aasm4pg -> starting jellyfish counts of kmers of size {params.km_size}" && jellyfish count -m {params.km_size} -s 100M -t {threads} -o {output.jf} -C <(zcat {input.reads}) && - jellyfish histo -h 1000000 -t {threads} {output.jf} > {output.histo} + echo "Aasm4pg -> starting to compute jellyfish histogram" + jellyfish histo -h 1000000 -t {threads} {output.jf} > {output.histo} """ -# NOT TESTED # Analyze the k-mer histogram and estimate genome characteristics, including genome size, heterozygosity, and error rates. rule genomescope: input: @@ -243,12 +265,11 @@ rule genomescope: shell: "genomescope.R -k {params.km_size} -i {input} -o {params.out_dir} -p {params.ploidy}" -# NOT TESTED # Compare the k-mer content between the assembly and the raw reads. 
rule kat: input: hap = rules.haplotigs_handling.output.hap, - jellyfish = rules.jellyfish.output.histo + jellyfish = rules.jellyfish.output.jf output: os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}.katplot.png") params: @@ -262,17 +283,16 @@ rule kat: f"{container_registry}/kat2.4.1" shell: """ - kat comp -o {params.out_dir} -t {threads} -m {params.km_size}--output_type png -v {input.jellyfish} {input.hap} && - kat plot spectra-cn -x 200 -o {params.out_dir}.katplot.png {params.path}-main.mx + kat comp -o {params.out_dir} -t {threads} -m {params.km_size} --output_type png -v {input.jellyfish} {input.hap} && + kat plot spectra-cn -x 200 -o {params.out_dir}.katplot.png {params.out_dir}-main.mx """ -# NOT TESTED # Creates a k-mer database from the input reads rule meryl: input: lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] output: - os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "merqury", "{sample}_reads-db.meryl") + directory(os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "meryl", "{sample}_reads-db.meryl")) params: km_size = get_kmer_size threads: 20 @@ -312,10 +332,9 @@ rule merqury: # Run Merqury cd {params.out_dir} && \ export MERQURY=/usr/local/share/merqury && \ - merqury.sh {input.read_db} {params.out_dir}/tmp_hap1.fasta.gz {params.out_dir}/tmp_hap1.fasta.gz {params.prefix} + merqury.sh {input.km_database} {params.out_dir}/tmp_hap1.fasta.gz {params.out_dir}/tmp_hap1.fasta.gz {params.prefix} """ -# NOT TESTED # Identifies LTR retrotransposons rule LTR_finder: input: @@ -330,7 +349,6 @@ rule LTR_finder: shell: "ltr_finder -C {input} > {output}" -# NOT TESTED # Calculates the LTR Assembly Index (LAI), a metric for assembly quality based on LTR retrotransposons rule LTR_retriever: input: @@ -421,7 +439,10 @@ rule ragtag: output: dir(os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","ragtag_scafold")) params: - out_dir = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","ragtag_scafold") + out_dir = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","ragtag_scafold"), + hap = rules.unpigz_to_fasta.output, + reference_genome = get_reference, + ragtag_bool = get_ragtag_bool threads: 4 resources: mem_mb = 80000, -- GitLab From 260db9a7a57ca60473fdeabfff72fe4fe7d1670c Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 10:18:58 +0100 Subject: [PATCH 109/178] Update lai generation --- workflow/Snakefile | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 81deb12..4e8bc1e 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -359,6 +359,7 @@ rule LTR_retriever: recap = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "recap_{sample}_hap{n}.tbl") params: prefix="{sample}_hap{n}", + scn="{sample}_hap{n}.scn", out_dir = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR") threads: 20 resources: @@ -369,13 +370,15 @@ rule LTR_retriever: shell: ''' export PATH="/opt/LTR_retriever:$PATH" && + cp {input.hap} {params.out_dir}/tmp_hap.fasta && cd {params.out_dir} && - LTR_retriever -threads {threads} -genome {input.hap} -infinder {input.scn} && - mv {params.prefix}.fa.out.LAI {output.lai} && - mv {params.prefix}.fa.tbl {output.recap} && + LTR_retriever -threads {threads} -genome tmp_hap.fasta -infinder {params.scn} && + mv {params.prefix}.fa.out.LAI 
{params.prefix}.out.LAI && + mv {params.prefix}.fa.tbl recap_{params.prefix}.tbl && rm {params.prefix}.fa?* && rm -rf .RepeatMaskerCache && rm {params.prefix}.fa + rm tmp_hap.fasta ''' # Rule to generate the html report @@ -439,8 +442,7 @@ rule ragtag: output: dir(os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","ragtag_scafold")) params: - out_dir = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","ragtag_scafold"), - hap = rules.unpigz_to_fasta.output, + out_file = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","ragtag_scafold", "{sample}_scafold_hap{n}.fasta.gz"), reference_genome = get_reference, ragtag_bool = get_ragtag_bool threads: 4 @@ -451,5 +453,5 @@ rule ragtag: f"{container_registry}/ragtag:2.0.1" shell: """ - ./workflow/scripts/ragtag_call.sh + ./workflow/scripts/ragtag_call.sh {params.ragtag_bool} {params.hap} {params.out_file} {params.reference_genome} {output} {threads} """ -- GitLab From 1b76a3fa125f2f3f7816140294ce174ff0fb9655 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 10:19:55 +0100 Subject: [PATCH 110/178] add conditional ragtag execution --- workflow/scripts/ragtag_call.sh | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/workflow/scripts/ragtag_call.sh b/workflow/scripts/ragtag_call.sh index accb3b6..348f4d9 100644 --- a/workflow/scripts/ragtag_call.sh +++ b/workflow/scripts/ragtag_call.sh @@ -9,22 +9,15 @@ HAP_IN=$2 HAP_OUT=$3 REF=$4 DIRR=$6 - +THREADS=$7 if [[ "$RAGTAG" == "True" || "$RAGTAG" == "true" ]]; then - echo "Asm4pg -> Running ragtag" mkdir -p $DIRR - ragtag.py scaffold -o {params.out_dir} -t {threads} {input.reference} {input.assembly} && - + ragtag.py scaffold -o $DIRR -t $THREADS $REF $HAP_IN + mv $DIRR/ragtag.scaffold.fasta $HAP_OUT + mv $DIRR/ragtag.scaffold.paf $HAP_OUT else echo "Asm4pg -> Ragtag option is off" mkdir -p $DIRR - fi - - - - - mv {params.out_dir}/ragtag.scaffold.fasta {output.scaffold} && - mv {params.out_dir}/ragtag.scaffold.paf {output.alignment} \ No newline at end of file -- GitLab From a2be66e26c1667cad9031c9a0bb4256cb0639418 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 10:20:34 +0100 Subject: [PATCH 111/178] Simplify the nproc usage --- local_run.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/local_run.sh b/local_run.sh index f09cadf..71ee7ee 100755 --- a/local_run.sh +++ b/local_run.sh @@ -5,17 +5,16 @@ # 07/01/24 SNG_BIND=$(pwd) -CORES=$(nproc) run_snakemake() { local option="$1" case "$option" in dry) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES -n -R all + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) -n -R all ;; dag) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES -R all --dag > dag.dot + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) -R all --dag > dag.dot if [ $? 
-eq 0 ]; then echo "Asm4pg -> DAG has been successfully generated as dag.dot" else @@ -24,7 +23,7 @@ run_snakemake() { fi ;; run) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $CORES -R all #--forceall + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) -R all #--forceall ;; *) echo "Invalid option: $option" -- GitLab From 841205bd7dabea1bca1cc5a5be8f29e1ddce2a3b Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 10:20:53 +0100 Subject: [PATCH 112/178] Adapt job.sh to new workflow --- job.sh | 125 ++++++++++++++++++++++++++------------------------------- 1 file changed, 57 insertions(+), 68 deletions(-) diff --git a/job.sh b/job.sh index 0fb7b32..f469a92 100644 --- a/job.sh +++ b/job.sh @@ -1,76 +1,65 @@ #!/bin/bash -################################ Slurm options ################################# -### prepare_calling_jobs -#SBATCH -J smk_main -### Max run time "hours:minutes:seconds" -#SBATCH --time=96:00:00 -#SBATCH --ntasks=1 #nb of processes -#SBATCH --cpus-per-task=1 # nb of cores for each process(1 process) -#SBATCH --mem=10G # max of memory (-m) -### Requirements nodes/servers (default: 1) -#SBATCH --nodes=1 -### Requirements cpu/core/task (default: 1) -#SBATCH --ntasks-per-node=1 -#SBATCH -o slurm_logs/snakemake.%N.%j.out -#SBATCH -e slurm_logs/snakemake.%N.%j.err -#SBATCH --mail-type=END,FAIL -#SBATCH --mail-user=lucien.piat@inare.fr -################################################################################ +#SBATCH --cpus-per-task=1 +#SBATCH -o slurm_logs/out_job_%j.out +#SBATCH -e slurm_logs/err_job_%j.err +#SBATCH --time=80:00:00 +#SBATCH -J asm4pg +#SBATCH --mem=10G -# Useful information to print -echo '########################################' -echo 'Date:' $(date --iso-8601=seconds) -echo 'User:' $USER -echo 'Host:' $HOSTNAME -echo 'Job Name:' $SLURM_JOB_NAME -echo 'Job ID:' $SLURM_JOB_ID -echo 'Number of nodes assigned to job:' $SLURM_JOB_NUM_NODES -echo 'Total number of cores for job (?):' $SLURM_NTASKS -echo 'Number of requested cores per node:' $SLURM_NTASKS_PER_NODE -echo 'Nodes assigned to job:' $SLURM_JOB_NODELIST -echo 'Number of CPUs assigned for each task:' $SLURM_CPUS_PER_TASK -echo 'Directory:' $(pwd) -# Detail Information: -echo 'scontrol show job:' -scontrol show job $SLURM_JOB_ID -echo '########################################' +# Verify arguments +if [ $# -ne 1 ]; then + echo "Usage: $0 [dry|dag|run]" + echo " dry - run the specified Snakefile in dry-run mode" + echo " dag - generate DAG for the specified Snakefile" + echo " run - run the specified Snakefile normally" + exit 1 +fi -# Function to load modules -load_modules() { - module purge # Clear any previously loaded modules +# Update this with the path to your images +echo 'Loading modules' +module purge +module load containers/Apptainer/1.2.5 +module load devel/Miniconda/Miniconda3 - # Loop through each module and load it - for module_name in "$@"; do - module load "$module_name" - done -} +echo 'Activating environment' +source activate wf_env -# Here specify the modules to load and their path -load_modules "python/3.9.7" "snakemake/6.5.1" +echo 'Starting Snakemake workflow' -### variables -SNG_BIND="/mnt/cbib/pangenoak_trials/GenomAsm4pg/" -CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml" -MAX_CORES=10 -PROFILE=".config/snakemake_profile/slurm" +run_snakemake() { + local option="$1" -echo 'Starting Snakemake workflow' + case "$option" in + dry) + snakemake -s "$snakefile" -c $(nproc) --dry-run + 
;;
+        dag)
+            snakemake -c $(nproc) --dag > dag.dot
+            if [ $? -eq 0 ]; then
+                echo "Asm4pg -> DAG has been successfully generated as dag.dot"
+            else
+                echo "Asm4pg -> Error: Failed to generate DAG."
+                exit 1
+            fi
+            ;;
+        run)
+            snakemake --workflow-profile ./.config/snakemake/profiles/slurm
+            ;;
+        *)
+            echo "Invalid option: $option"
+            echo "Usage: $0 [dry|dag|run]"
+            exit 1
+            ;;
+    esac
+
+    # Check if the Snakemake command was successful
+    if [ $? -eq 0 ]; then
+        echo "Asm4pg -> Snakemake workflow completed successfully."
+    else
+        echo "Asm4pg -> Error: Snakemake workflow execution failed."
+        exit 1
+    fi
+}
+
+# Execute the function with the provided option
+run_snakemake "$1"
-- GitLab

From f70f68bfff64ac9ee5ae37531baa906f917f3085 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 10 Jan 2025 10:49:00 +0100
Subject: [PATCH 113/178] add restart times to busco

---
 workflow/Snakefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 4e8bc1e..ddd1b83 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -201,6 +201,7 @@ rule busco:
         lineage=get_busco_lin,
         sample="{sample}_hap{n}"
     threads: 20
+    restart_times: 2
     resources:
         mem_mb=100000,
         time="10:00:00"
-- GitLab

From 2987ddab9f83a780056582c7048732d236e207a1 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 10 Jan 2025 10:50:31 +0100
Subject: [PATCH 114/178] ignore LTR cache

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 660a0f9..1316504 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@ busco_downloads
 .cache
 results
 dag.dot
+.RepeatMaskerCache
\ No newline at end of file
-- GitLab

From 128796c1fa9ef523a5f3650e7fe4961fba5c23ad Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 10 Jan 2025 11:02:14 +0100
Subject: [PATCH 115/178] Update cluster tutorial

---
 README.md             | 16 +++++++++-------
 slurm_logs/.gitignore |  5 -----
 2 files changed, 9 insertions(+), 12 deletions(-)
 delete mode 100644 slurm_logs/.gitignore

diff --git a/README.md b/README.md
index 4c6908d..faebab3 100644
--- a/README.md
+++ b/README.md
@@ -35,23 +35,25 @@ git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git && cd GenomAsm4pg
 ### 2. Configure the pipeline
 - Edit the `masterconfig` file in the `.config/` directory with your sample information.
 
-### 3. Run the workflow
-#### A. On an HPC
-Create a virtual environment for the workflow to run into:
+### 3. Run the workflow
+#### <ins>A. On an HPC</ins>
+- Edit `job.sh` with your email and add path to the needed modules (`Singularity/Apptainer`, `Miniforge`)
+- Provide the environment you created in `job.sh`, under `source activate wf_env`, you can create it like this:
 ```bash
 conda create -n wf_env -c conda-forge -c bioconda snakemake=8.4.7 snakemake-executor-plugin-slurm
 ```
 > Use Miniforge with the conda-forge channel, see why [here](https://science-ouverte.inrae.fr/fr/offre-service/fiches-pratiques-et-recommandations/quelles-alternatives-aux-fonctionnalites-payantes-danaconda) (french)
-
-- Edit `job.sh` with your email and add path to the needed modules (`Singularity/Apptainer`, `Miniforge`)
-- Provide the environment you created in `job.sh`, under `source activate wf_env`
+- Add the log directory for SLURM
+```bash
+mkdir slurm_logs
+```
 - Run the workflow :
 ```bash
 sbatch job.sh dry # Check for warnings
 sbatch job.sh run # Then
 ```
 > **Nb 1:** If your account name can't be automatically determined, add it in the `.config/snakemake/profiles/slurm/config.yaml` file.
-#### B. Locally
+#### <ins>B. Locally</ins>
 - Make sure you have Snakemake and Singularity/Apptainer installed
 - Run the workflow :
 ```bash
diff --git a/slurm_logs/.gitignore b/slurm_logs/.gitignore
deleted file mode 100644
index 9ef8e2c..0000000
--- a/slurm_logs/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-### Slurm needs an output for the log files
-# Ignore everything in this directory
-*
-# Except this file
-!.gitignore
\ No newline at end of file
-- GitLab

From 10f519c918fdd91665ea1723be53d6bbd215f833 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 10 Jan 2025 11:08:39 +0100
Subject: [PATCH 116/178] call default snakemake file

---
 job.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/job.sh b/job.sh
index f469a92..60c6918 100644
--- a/job.sh
+++ b/job.sh
@@ -31,7 +31,7 @@ run_snakemake() {
 
     case "$option" in
         dry)
-            snakemake -s "$snakefile" -c $(nproc) --dry-run
+            snakemake -c $(nproc) --dry-run
             ;;
         dag)
-- GitLab

From 4a5cdca7d112ee7bad047c66e1a3c318ece2337d Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 10 Jan 2025 11:12:20 +0100
Subject: [PATCH 117/178] Change restart time

---
 .config/snakemake/profiles/slurm/config.yaml | 1 +
 workflow/Snakefile                           | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/.config/snakemake/profiles/slurm/config.yaml b/.config/snakemake/profiles/slurm/config.yaml
index 24dbe1d..b5f0664 100644
--- a/.config/snakemake/profiles/slurm/config.yaml
+++ b/.config/snakemake/profiles/slurm/config.yaml
@@ -2,6 +2,7 @@ executor: slurm
 jobs: 10
 use-singularity: true
 singularity-args: "--bind $(pwd)"
+keep-going: True
 
 default-resources:
     #slurm_account: add if needed on your hpc
     runtime: 60
diff --git a/workflow/Snakefile b/workflow/Snakefile
index ddd1b83..575579a 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -201,7 +201,6 @@ rule busco:
         lineage=get_busco_lin,
         sample="{sample}_hap{n}"
     threads: 20
-    restart_times: 2
     resources:
         mem_mb=100000,
         time="10:00:00"
-- GitLab

From e1b53bbf80ad11caa9a5e306e826a26612975a5a Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 10 Jan 2025 13:50:44 +0100
Subject: [PATCH 118/178] remove cpu per task

---
 .config/snakemake/profiles/slurm/config.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.config/snakemake/profiles/slurm/config.yaml b/.config/snakemake/profiles/slurm/config.yaml
index b5f0664..7f38863 100644
---
a/.config/snakemake/profiles/slurm/config.yaml +++ b/.config/snakemake/profiles/slurm/config.yaml @@ -6,5 +6,4 @@ keep-going: True default-resources: #slurm_account: add if needed on your hpc - runtime: 60 - cpus_per_task: 1 \ No newline at end of file + runtime: 60 \ No newline at end of file -- GitLab From 1a794de1ae83b419d67709d27120aea634e52de1 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 13:50:59 +0100 Subject: [PATCH 119/178] add a quast calling script --- workflow/quast_call.sh | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 workflow/quast_call.sh diff --git a/workflow/quast_call.sh b/workflow/quast_call.sh new file mode 100644 index 0000000..b63cae7 --- /dev/null +++ b/workflow/quast_call.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Script to dynamically run quast on produced genomes +# Author: Lucien PIAT +# For: Project Pangenoak +# Date: January 6, 2025 + +# Arguments +REFERENCE_GENOME="$1" +PURGE_BOOL="$2" +RAGTAG_BOOL="$3" +RAW_HAP1="$4" +RAW_HAP2="$5" +FINAL_HAP1="$6" +FINAL_HAP2="$7" +RAGTAG_HAP1="$8" +RAGTAG_HAP2="$9" +OUTPUT_DIR="${10}" + +# Create the list of genomes to run quast on +genomes=("$FINAL_HAP1" "$FINAL_HAP2") + +if [ "$PURGE_BOOL" == "True" ]; then + genomes+=("$RAW_HAP1" "$RAW_HAP2") +fi + +if [ "$RAGTAG_BOOL" == "True" ]; then + genomes+=("$RAGTAG_HAP1" "$RAGTAG_HAP2") +fi + +# Build the quast command +quast_cmd="quast " +if [ "$REFERENCE_GENOME" != "None" ]; then + quast_cmd+="--reference $REFERENCE_GENOME " +fi +quast_cmd+="${genomes[@]} --output-dir $OUTPUT_DIR" + +# Run the quast command +eval $quast_cmd -- GitLab From 9bcd026adaa2d0ed227795ed371fd1ff62be9cba Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 13:51:15 +0100 Subject: [PATCH 120/178] Add quast rule --- workflow/Snakefile | 61 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 575579a..191457d 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -11,7 +11,9 @@ import yaml container_registry = config.get("container_registry", "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg") output_dir = config.get("output_dir", "results/") - + +assembly_qc_folder = os.path.join(output_dir, "{sample}_results", "04_assembly_qc") + rule all: input: # Required final assemblies and graphs @@ -32,31 +34,31 @@ rule all: sample=config["samples"].keys() ), expand( - os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.qv"), + os.path.join(assembly_qc_folder, "merqury", "{sample}_merqury.qv"), sample=config["samples"].keys() ), expand( - os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.completeness.stats"), + os.path.join(assembly_qc_folder, "merqury", "{sample}_merqury.completeness.stats"), sample=config["samples"].keys() ), expand( - os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "{sample}_hap{n}_genometools_stats.txt"), + os.path.join(assembly_qc_folder, "hap{n}", "{sample}_hap{n}_genometools_stats.txt"), sample=config["samples"].keys(), n=[1, 2] ), expand( - os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "telomeres", "{sample}_hap{n}_telomeres.txt"), + os.path.join(assembly_qc_folder, "hap{n}", "telomeres", "{sample}_hap{n}_telomeres.txt"), sample=config["samples"].keys(), n=[1, 2] ), expand( - os.path.join(output_dir, "{sample}_results", 
"04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}.katplot.png"), + os.path.join(assembly_qc_folder, "hap{n}", "katplot", "{sample}_hap{n}.katplot.png"), sample=config["samples"].keys(), n=[1, 2] ), expand( - os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "{sample}_hap{n}.out.LAI"), + os.path.join(assembly_qc_folder, "hap{n}", "LTR", "{sample}_hap{n}.out.LAI"), sample=config["samples"].keys(), n=[1, 2] ), expand( - os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "recap_{sample}_hap{n}.tbl"), + os.path.join(assembly_qc_folder, "hap{n}", "LTR", "recap_{sample}_hap{n}.tbl"), sample=config["samples"].keys(), n=[1, 2] ) @@ -88,15 +90,12 @@ rule hifiasm: # Convert the gfa files of hifiasm to fasta TO_FA_CMD = r"""/^S/{print ">"$2;print $3}""" - rule pigz_gfa_to_fasta: input: - hap1_gfa = rules.hifiasm.output.hap1, - hap2_gfa = rules.hifiasm.output.hap2, + gfa = os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}.bp.hap{n}.p_ctg.gfa"), reads = lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] output: - hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}_hap1.fasta.gz"), - hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}_hap2.fasta.gz") + fasta = os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap{n}.fasta.gz") threads: 4 resources: mem_mb=25000, @@ -105,10 +104,10 @@ rule pigz_gfa_to_fasta: f"{container_registry}/pigz" shell: """ - awk {TO_FA_CMD:q} {input.hap1_gfa} | pigz -p {threads} > {output.hap1} && - awk {TO_FA_CMD:q} {input.hap2_gfa} | pigz -p {threads} > {output.hap2} + awk {TO_FA_CMD:q} {input.gfa} | pigz -p {threads} > {output.fasta} """ + # Potentialy purge the haplotigs using purge_dups rule haplotigs_handling: input: @@ -304,7 +303,6 @@ rule meryl: shell: "meryl k={params.km_size} threads={threads} count {input} output {output}" -# NOT TESTED # Calculates metrics like QV and completeness, providing a quantitative assessment of the genome assembly. 
rule merqury: input: @@ -350,6 +348,7 @@ rule LTR_finder: shell: "ltr_finder -C {input} > {output}" +# NOT TESTED # Calculates the LTR Assembly Index (LAI), a metric for assembly quality based on LTR retrotransposons rule LTR_retriever: input: @@ -382,6 +381,7 @@ rule LTR_retriever: rm tmp_hap.fasta ''' +# NOT TESTED # Rule to generate the html report rule generate_report: input: @@ -428,6 +428,7 @@ rule generate_report: script: "../scripts/report.Rmd" +# NOT TESTED # Rule to relocate the report rule relocate_report: input: @@ -437,6 +438,8 @@ rule relocate_report: shell: "mv {input} {output}" +# NOT TESTED +# Rule to create scafold on assemblies rule ragtag: input: rules.unpigz_to_fasta.output @@ -456,3 +459,29 @@ rule ragtag: """ ./workflow/scripts/ragtag_call.sh {params.ragtag_bool} {params.hap} {params.out_file} {params.reference_genome} {output} {threads} """ + +# NOT TESTED +# Rule to create a quast report assessing the quality of all assemblies +rule quast: + input: + raw_hap1=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap1.fasta.gz"), + raw_hap2=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap2.fasta.gz"), + final_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "{sample}_final_hap1.fasta.gz"), + final_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "{sample}_final_hap2.fasta.gz"), + rules.ragtag.output + params: + reference_genome=get_reference, + purge_bool=get_purge_bool, + ragtag_bool=get_ragtag_bool, + ragtag_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "ragtag_scafold", "{sample}_scafold_hap1.fasta.gz"), + ragtag_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", "{sample}_scafold_hap2.fasta.gz") + output: + quast_output=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast") + container: + f"{container_registry}/staphb/quast:5.2.0" + shell: + """ + bash ./workflow/scripts/quast_call.sh "{params.reference_genome}" "{params.purge_bool}" "{params.ragtag_bool}" \ + "{input.raw_hap1}" "{input.raw_hap2}" "{input.final_hap1}" "{input.final_hap2}" \ + "{params.ragtag_hap1}" "{params.ragtag_hap2}" "{output.quast_output}" + """ -- GitLab From 19352665262e7d717b48580d8f52b4992ce2344b Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 13:55:27 +0100 Subject: [PATCH 121/178] update quast rule --- workflow/Snakefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 191457d..223fc0a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -469,14 +469,14 @@ rule quast: final_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "{sample}_final_hap1.fasta.gz"), final_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "{sample}_final_hap2.fasta.gz"), rules.ragtag.output - params: - reference_genome=get_reference, - purge_bool=get_purge_bool, - ragtag_bool=get_ragtag_bool, - ragtag_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "ragtag_scafold", "{sample}_scafold_hap1.fasta.gz"), - ragtag_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", "{sample}_scafold_hap2.fasta.gz") output: quast_output=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast") + params: + ragtag_hap1 = os.path.join(output_dir, "{sample}_results", 
"02_final_assembly", "hap1", "ragtag_scafold", "{sample}_scafold_hap1.fasta.gz"), + ragtag_hap2 = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", "{sample}_scafold_hap2.fasta.gz"), + reference_genome = get_reference, + purge_bool = get_purge_bool, + ragtag_bool = get_ragtag_bool container: f"{container_registry}/staphb/quast:5.2.0" shell: -- GitLab From 308e26cbc5e83fbc14bec6f8ef274235c31491f5 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 14:59:36 +0100 Subject: [PATCH 122/178] ignore slurm logs --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 1316504..11626a9 100644 --- a/.gitignore +++ b/.gitignore @@ -16,11 +16,11 @@ !*/ node_modules node_modules/* -!slurm_logs/* !workflow/* .snakemake busco_downloads .cache results dag.dot -.RepeatMaskerCache \ No newline at end of file +.RepeatMaskerCache +slurm_logs \ No newline at end of file -- GitLab From f676686087b272aefae36593d52f98f06420bbdf Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 15:00:28 +0100 Subject: [PATCH 123/178] move call script --- workflow/{ => scripts}/quast_call.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename workflow/{ => scripts}/quast_call.sh (100%) diff --git a/workflow/quast_call.sh b/workflow/scripts/quast_call.sh similarity index 100% rename from workflow/quast_call.sh rename to workflow/scripts/quast_call.sh -- GitLab From 8f05c41a790be5a44bc39f431e876cbf449c22b2 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 15:01:47 +0100 Subject: [PATCH 124/178] fix ragtag gz output --- .config/masterconfig.yaml | 9 +++++++ workflow/Snakefile | 47 +++++++++++---------------------- workflow/scripts/ragtag_call.sh | 3 ++- workflow/unimplemented.smk | 28 +++++++++++++++++++- 4 files changed, 53 insertions(+), 34 deletions(-) diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml index 29bdd03..bbced5e 100644 --- a/.config/masterconfig.yaml +++ b/.config/masterconfig.yaml @@ -7,3 +7,12 @@ samples: container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg" output_dir: "results/" + +# run2: +# fasta_gz: small_example.fasta.gz +# assembly_purge_force: 2 +# mode: default +# run_purge_dups: True +# busco_lineage: eudicots_odb10 +# ploidy: 2 +# kmer_size: 21 \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 223fc0a..ef90039 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -33,6 +33,10 @@ rule all: os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png"), sample=config["samples"].keys() ), + expand( + os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "summary.txt"), + sample=config["samples"].keys() + ), expand( os.path.join(assembly_qc_folder, "merqury", "{sample}_merqury.qv"), sample=config["samples"].keys() @@ -60,6 +64,10 @@ rule all: expand( os.path.join(assembly_qc_folder, "hap{n}", "LTR", "recap_{sample}_hap{n}.tbl"), sample=config["samples"].keys(), n=[1, 2] + ), + expand( + os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco", "short_summary.specific.{sample}_hap{n}.txt"), + sample=config["samples"].keys(), n=[1, 2] ) # Genome assembly using hifiasm @@ -251,7 +259,8 @@ rule genomescope: input: rules.jellyfish.output.histo output: - plot = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", 
"linear_plot.png") + plot = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png"), + summary = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "summary.txt") params: ploidy = get_ploidy, km_size = get_kmer_size, @@ -373,10 +382,9 @@ rule LTR_retriever: cp {input.hap} {params.out_dir}/tmp_hap.fasta && cd {params.out_dir} && LTR_retriever -threads {threads} -genome tmp_hap.fasta -infinder {params.scn} && - mv {params.prefix}.fa.out.LAI {params.prefix}.out.LAI && - mv {params.prefix}.fa.tbl recap_{params.prefix}.tbl && - rm {params.prefix}.fa?* && - rm -rf .RepeatMaskerCache && + mv tmp_hap{params.prefix}.fasta.out.LAI {params.prefix}.out.LAI && + mv tmp_hap{params.prefix}.fasta.tbl recap_{params.prefix}.tbl && + rm tmp_hap{params.prefix}.fa?* && rm {params.prefix}.fa rm tmp_hap.fasta ''' @@ -386,6 +394,7 @@ rule LTR_retriever: rule generate_report: input: genomescope = rules.genomescope.output.plot, + genomescope_sum = rules.genomescope.output.summary, genometools_on_raw_data = rules.genometools_on_raw_data.output, genometools_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "{sample}_hap1_genometools_stats.txt"), @@ -458,30 +467,4 @@ rule ragtag: shell: """ ./workflow/scripts/ragtag_call.sh {params.ragtag_bool} {params.hap} {params.out_file} {params.reference_genome} {output} {threads} - """ - -# NOT TESTED -# Rule to create a quast report assessing the quality of all assemblies -rule quast: - input: - raw_hap1=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap1.fasta.gz"), - raw_hap2=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap2.fasta.gz"), - final_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "{sample}_final_hap1.fasta.gz"), - final_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "{sample}_final_hap2.fasta.gz"), - rules.ragtag.output - output: - quast_output=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast") - params: - ragtag_hap1 = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "ragtag_scafold", "{sample}_scafold_hap1.fasta.gz"), - ragtag_hap2 = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", "{sample}_scafold_hap2.fasta.gz"), - reference_genome = get_reference, - purge_bool = get_purge_bool, - ragtag_bool = get_ragtag_bool - container: - f"{container_registry}/staphb/quast:5.2.0" - shell: - """ - bash ./workflow/scripts/quast_call.sh "{params.reference_genome}" "{params.purge_bool}" "{params.ragtag_bool}" \ - "{input.raw_hap1}" "{input.raw_hap2}" "{input.final_hap1}" "{input.final_hap2}" \ - "{params.ragtag_hap1}" "{params.ragtag_hap2}" "{output.quast_output}" - """ + """ \ No newline at end of file diff --git a/workflow/scripts/ragtag_call.sh b/workflow/scripts/ragtag_call.sh index 348f4d9..30b6270 100644 --- a/workflow/scripts/ragtag_call.sh +++ b/workflow/scripts/ragtag_call.sh @@ -15,7 +15,8 @@ if [[ "$RAGTAG" == "True" || "$RAGTAG" == "true" ]]; then echo "Asm4pg -> Running ragtag" mkdir -p $DIRR ragtag.py scaffold -o $DIRR -t $THREADS $REF $HAP_IN - mv $DIRR/ragtag.scaffold.fasta $HAP_OUT + gzip $DIRR/ragtag.scaffold.fasta + mv $DIRR/ragtag.scaffold.fasta.gz $HAP_OUT mv $DIRR/ragtag.scaffold.paf $HAP_OUT else echo "Asm4pg -> Ragtag option is off" diff --git a/workflow/unimplemented.smk b/workflow/unimplemented.smk index c6be40b..2a57718 100644 --- 
a/workflow/unimplemented.smk +++ b/workflow/unimplemented.smk @@ -152,4 +152,30 @@ rule no_purge_report_trio: container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: - "../scripts/report_trio.Rmd" \ No newline at end of file + "../scripts/report_trio.Rmd" + +# NOT TESTED +# Rule to create a quast report assessing the quality of all assemblies +rule quast: + input: + raw_hap1=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap1.fasta.gz"), + raw_hap2=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap2.fasta.gz"), + final_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "{sample}_final_hap1.fasta.gz"), + final_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "{sample}_final_hap2.fasta.gz"), + rules.ragtag.output + output: + quast_output=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast") + params: + ragtag_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "ragtag_scafold", "{sample}_scafold_hap1.fasta.gz"), + ragtag_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", "{sample}_scafold_hap2.fasta.gz"), + reference_genome=get_reference, + purge_bool=get_purge_bool, + ragtag_bool=get_ragtag_bool + container: + f"{container_registry}/staphb/quast:5.2.0" + shell: + """ + bash ./workflow/scripts/quast_call.sh "{params.reference_genome}" "{params.purge_bool}" "{params.ragtag_bool}" \ + "{input.raw_hap1}" "{input.raw_hap2}" "{input.final_hap1}" "{input.final_hap2}" \ + "{params.ragtag_hap1}" "{params.ragtag_hap2}" "{output.quast_output}" + """ \ No newline at end of file -- GitLab From 76a81c040dc331aedb87e898c370828fc2f55ca3 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 16:44:53 +0100 Subject: [PATCH 125/178] Update scalfolding and report rules --- workflow/Snakefile | 72 +++++++--------------------------------------- 1 file changed, 10 insertions(+), 62 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index ef90039..572a706 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -22,52 +22,12 @@ rule all: sample=config["samples"].keys(), n=[1, 2] ), expand( - os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "cutoffs_graph_hap{n}.png"), + os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "ragtag_scafold", "recap.txt"), sample=config["samples"].keys(), n=[1, 2] ), expand( - os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "{sample}_genometools_stats.txt"), + os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html"), sample=config["samples"].keys() - ), - expand( - os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png"), - sample=config["samples"].keys() - ), - expand( - os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "summary.txt"), - sample=config["samples"].keys() - ), - expand( - os.path.join(assembly_qc_folder, "merqury", "{sample}_merqury.qv"), - sample=config["samples"].keys() - ), - expand( - os.path.join(assembly_qc_folder, "merqury", "{sample}_merqury.completeness.stats"), - sample=config["samples"].keys() - ), - expand( - os.path.join(assembly_qc_folder, "hap{n}", "{sample}_hap{n}_genometools_stats.txt"), - sample=config["samples"].keys(), n=[1, 2] - ), - expand( - os.path.join(assembly_qc_folder, "hap{n}", "telomeres", 
"{sample}_hap{n}_telomeres.txt"), - sample=config["samples"].keys(), n=[1, 2] - ), - expand( - os.path.join(assembly_qc_folder, "hap{n}", "katplot", "{sample}_hap{n}.katplot.png"), - sample=config["samples"].keys(), n=[1, 2] - ), - expand( - os.path.join(assembly_qc_folder, "hap{n}", "LTR", "{sample}_hap{n}.out.LAI"), - sample=config["samples"].keys(), n=[1, 2] - ), - expand( - os.path.join(assembly_qc_folder, "hap{n}", "LTR", "recap_{sample}_hap{n}.tbl"), - sample=config["samples"].keys(), n=[1, 2] - ), - expand( - os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco", "short_summary.specific.{sample}_hap{n}.txt"), - sample=config["samples"].keys(), n=[1, 2] ) # Genome assembly using hifiasm @@ -382,11 +342,8 @@ rule LTR_retriever: cp {input.hap} {params.out_dir}/tmp_hap.fasta && cd {params.out_dir} && LTR_retriever -threads {threads} -genome tmp_hap.fasta -infinder {params.scn} && - mv tmp_hap{params.prefix}.fasta.out.LAI {params.prefix}.out.LAI && - mv tmp_hap{params.prefix}.fasta.tbl recap_{params.prefix}.tbl && - rm tmp_hap{params.prefix}.fa?* && - rm {params.prefix}.fa - rm tmp_hap.fasta + mv tmp_hap.fasta.out.LAI {params.prefix}.out.LAI && + mv tmp_hap.fasta.tbl recap_{params.prefix}.tbl ''' # NOT TESTED @@ -437,25 +394,16 @@ rule generate_report: script: "../scripts/report.Rmd" -# NOT TESTED -# Rule to relocate the report -rule relocate_report: - input: - rules.generate_report.output - output: - os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html"), - shell: - "mv {input} {output}" - # NOT TESTED # Rule to create scafold on assemblies -rule ragtag: +rule scafolding: input: - rules.unpigz_to_fasta.output + os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "{sample}_final_hap{n}.fasta") output: - dir(os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","ragtag_scafold")) + os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "ragtag_scafold", "recap.txt") params: - out_file = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","ragtag_scafold", "{sample}_scafold_hap{n}.fasta.gz"), + out_file = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "ragtag_scafold", "{sample}_scafold_hap{n}.fasta.gz"), + out_dir = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "ragtag_scafold"), reference_genome = get_reference, ragtag_bool = get_ragtag_bool threads: 4 @@ -466,5 +414,5 @@ rule ragtag: f"{container_registry}/ragtag:2.0.1" shell: """ - ./workflow/scripts/ragtag_call.sh {params.ragtag_bool} {params.hap} {params.out_file} {params.reference_genome} {output} {threads} + ./workflow/scripts/ragtag_call.sh {params.ragtag_bool} {params.out_dir} {threads} {params.reference_genome} {input} {params.out_file} {output} """ \ No newline at end of file -- GitLab From 373393dbe3e25869ba19ff7f6e8c7df90a333ecb Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Fri, 10 Jan 2025 16:45:09 +0100 Subject: [PATCH 126/178] Update the scafolding to output a recap --- workflow/scripts/ragtag_call.sh | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/workflow/scripts/ragtag_call.sh b/workflow/scripts/ragtag_call.sh index 30b6270..9f56f1a 100644 --- a/workflow/scripts/ragtag_call.sh +++ b/workflow/scripts/ragtag_call.sh @@ -5,20 +5,33 @@ # Date: January 6, 2025 RAGTAG=$1 -HAP_IN=$2 -HAP_OUT=$3 +DIRR=$2 +THREADS=$3 REF=$4 -DIRR=$6 -THREADS=$7 +HAP_IN=$5 +HAP_OUT=$6 
+RECAP=$7
+
+# Echo parameters into the recap file
+echo "RAGTAG: $RAGTAG" > $RECAP
+echo "DIRR: $DIRR" >> $RECAP
+echo "THREADS: $THREADS" >> $RECAP
+echo "REF: $REF" >> $RECAP
+echo "HAP_IN: $HAP_IN" >> $RECAP
+echo "HAP_OUT: $HAP_OUT" >> $RECAP

 if [[ "$RAGTAG" == "True" || "$RAGTAG" == "true" ]]; then
     echo "Asm4pg -> Running ragtag"
+    echo "Ragtag execution started" >> $RECAP
     mkdir -p $DIRR
+
     ragtag.py scaffold -o $DIRR -t $THREADS $REF $HAP_IN
     gzip $DIRR/ragtag.scaffold.fasta
     mv $DIRR/ragtag.scaffold.fasta.gz $HAP_OUT
-    mv $DIRR/ragtag.scaffold.paf $HAP_OUT
+    echo "Ragtag execution completed" >> $RECAP
+    echo "Output file: $HAP_OUT" >> $RECAP
 else
     echo "Asm4pg -> Ragtag option is off"
+    echo "Ragtag option is off" >> $RECAP
     mkdir -p $DIRR
 fi
-- GitLab


From 2d6aa60423ae83419d67709d27120aea634e52de1 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Mon, 13 Jan 2025 10:27:54 +0100
Subject: [PATCH 127/178] add note for apptainer

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index faebab3..654091e 100644
--- a/README.md
+++ b/README.md
@@ -25,13 +25,14 @@ This workflow uses [Snakemake](https://snakemake.readthedocs.io/en/stable/) to q
 ## Requirement
 Miniforge, Singularity/Apptainer, Snakemake
+
 ## How to Use
 ### 1. Set up
 Clone the Git repository
 ```bash
 git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git && cd GenomAsm4pg
 ```
-
+
+> All other tools run in Singularity/Apptainer images that Snakemake downloads automatically. The images total ~5.5 GB.
 ### 2. Configure the pipeline
 - Edit the `masterconfig` file in the `.config/` directory with your sample information.
-- GitLab


From 9da795d6b95279f267061e1a00b77b910188b56c Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Mon, 13 Jan 2025 15:15:25 +0100
Subject: [PATCH 128/178] Fix Busco output quast rule

---
 workflow/Snakefile         | 55 +++++++++++++++++++++++++++++------
 workflow/unimplemented.smk | 25 -----------------
 2 files changed, 45 insertions(+), 35 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 572a706..e9b927c 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -162,7 +162,7 @@ rule busco:
     input:
         rules.unpigz_to_fasta.output
     output:
-        os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco", "short_summary.specific.{sample}_hap{n}.txt")
+        os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco", "busco_{sample}_hap{n}.txt")
     params:
         prefix = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco"),
         lineage=get_busco_lin,
@@ -174,7 +174,12 @@ rule busco:
     container:
         f"{container_registry}/busco:5.7.1"
     shell:
-        "busco -f -i {input} -l {params.lineage} --out_path {params.prefix} -o {params.sample} -m genome -c {threads}"
+        """
+        busco -f -i {input} -l {params.lineage} --out_path {params.prefix} -o {params.sample} -m genome -c {threads} &&
+        echo "Asm4pg -> cleaning busco output files" &&
+        mv {params.prefix}/{params.sample}/short_summary.specific.{params.lineage}.{params.sample}.txt {output} &&
+        rm -rf {params.prefix}/{params.sample}
+        """

 # Estimate telomeric region content
 rule find_telomeres:
@@ -210,7 +215,7 @@ rule jellyfish:
         """
         echo "Aasm4pg -> starting jellyfish counts of kmers of size {params.km_size}" &&
         jellyfish count -m {params.km_size} -s 100M -t {threads} -o {output.jf} -C <(zcat {input.reads}) &&
-        echo "Aasm4pg -> starting to compute
jellyfish histogram" && jellyfish histo -h 1000000 -t {threads} {output.jf} > {output.histo} """ @@ -283,7 +288,8 @@ rule merqury: stats = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.completeness.stats") params: prefix = "{sample}_merqury", - out_dir = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury") + out_dir = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury"), + hap = "{n}" threads: 20 resources: mem_mb=60000, @@ -299,8 +305,9 @@ rule merqury: # Run Merqury cd {params.out_dir} && \ export MERQURY=/usr/local/share/merqury && \ - merqury.sh {input.km_database} {params.out_dir}/tmp_hap1.fasta.gz {params.out_dir}/tmp_hap1.fasta.gz {params.prefix} - rm {params.out_dir}/tmp_hap?.fasta.gz + merqury.sh {input.km_database} {params.out_dir}/tmp_hap1.fasta.gz {params.out_dir}/tmp_hap2.fasta.gz {params.prefix} + rm {params.out_dir}/tmp_hap1.fasta.gz + rm {params.out_dir}/tmp_hap2.fasta.gz """ # Identifies LTR retrotransposons @@ -340,10 +347,12 @@ rule LTR_retriever: ''' export PATH="/opt/LTR_retriever:$PATH" && cp {input.hap} {params.out_dir}/tmp_hap.fasta && - cd {params.out_dir} && + cd {params.out_dir} && LTR_retriever -threads {threads} -genome tmp_hap.fasta -infinder {params.scn} && mv tmp_hap.fasta.out.LAI {params.prefix}.out.LAI && - mv tmp_hap.fasta.tbl recap_{params.prefix}.tbl + mv tmp_hap.fasta.tbl recap_{params.prefix}.tbl && + echo "Asm4pg -> Cleaning LTR_retriever tmp output files" && + rm tmp* ''' # NOT TESTED @@ -357,8 +366,8 @@ rule generate_report: genometools_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "{sample}_hap1_genometools_stats.txt"), genometools_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "{sample}_hap2_genometools_stats.txt"), - busco_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "short_summary.specific.{sample}_hap1.txt"), - busco_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "short_summary.specific.{sample}_hap2.txt"), + busco_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "busco_{sample}_hap1.txt"), + busco_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "busco_{sample}_hap2.txt"), kplot_hap1 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "katplot", "{sample}_hap1.katplot.png"), kplot_hap2 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "katplot", "{sample}_hap2.katplot.png"), @@ -415,4 +424,30 @@ rule scafolding: shell: """ ./workflow/scripts/ragtag_call.sh {params.ragtag_bool} {params.out_dir} {threads} {params.reference_genome} {input} {params.out_file} {output} + """ + +# NOT TESTED +# Rule to create a quast report assessing the quality of all assemblies +rule quast: + input: + raw_hap1=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap1.fasta.gz"), + raw_hap2=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap2.fasta.gz"), + final_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "{sample}_final_hap1.fasta.gz"), + final_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "{sample}_final_hap2.fasta.gz"), + scafold = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "ragtag_scafold", "recap.txt") + output: + quast_output=os.path.join(output_dir, "{sample}_results", 
"04_assembly_qc", "quast") + params: + ragtag_hap1=lambda wildcards: os.path.join(output_dir, f"{wildcards.sample}_results", "02_final_assembly", "hap1", "ragtag_scafold", f"{wildcards.sample}_scafold_hap1.fasta.gz"), + ragtag_hap2=lambda wildcards: os.path.join(output_dir, f"{wildcards.sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", f"{wildcards.sample}_scafold_hap2.fasta.gz"), + reference_genome=lambda wildcards: get_reference(wildcards.sample), + purge_bool=lambda wildcards: get_purge_bool(wildcards.sample), + ragtag_bool=lambda wildcards: get_ragtag_bool(wildcards.sample) + container: + f"{container_registry}/staphb/quast:5.2.0" + shell: + """ + bash ./workflow/scripts/quast_call.sh {params.reference_genome} {params.purge_bool} {params.ragtag_bool} \ + {input.raw_hap1} {input.raw_hap2} {input.final_hap1} {input.final_hap2} \ + {params.ragtag_hap1} {params.ragtag_hap2} output.quast_output} """ \ No newline at end of file diff --git a/workflow/unimplemented.smk b/workflow/unimplemented.smk index 2a57718..6dd5f07 100644 --- a/workflow/unimplemented.smk +++ b/workflow/unimplemented.smk @@ -154,28 +154,3 @@ rule no_purge_report_trio: script: "../scripts/report_trio.Rmd" -# NOT TESTED -# Rule to create a quast report assessing the quality of all assemblies -rule quast: - input: - raw_hap1=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap1.fasta.gz"), - raw_hap2=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap2.fasta.gz"), - final_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "{sample}_final_hap1.fasta.gz"), - final_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "{sample}_final_hap2.fasta.gz"), - rules.ragtag.output - output: - quast_output=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast") - params: - ragtag_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "ragtag_scafold", "{sample}_scafold_hap1.fasta.gz"), - ragtag_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", "{sample}_scafold_hap2.fasta.gz"), - reference_genome=get_reference, - purge_bool=get_purge_bool, - ragtag_bool=get_ragtag_bool - container: - f"{container_registry}/staphb/quast:5.2.0" - shell: - """ - bash ./workflow/scripts/quast_call.sh "{params.reference_genome}" "{params.purge_bool}" "{params.ragtag_bool}" \ - "{input.raw_hap1}" "{input.raw_hap2}" "{input.final_hap1}" "{input.final_hap2}" \ - "{params.ragtag_hap1}" "{params.ragtag_hap2}" "{output.quast_output}" - """ \ No newline at end of file -- GitLab From e80b2b1917116b3d175e219102312b3ddaadbbf0 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 13 Jan 2025 15:16:33 +0100 Subject: [PATCH 129/178] Add commented block for unlocking with snakemake --- job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/job.sh b/job.sh index 60c6918..17ee3a0 100644 --- a/job.sh +++ b/job.sh @@ -43,7 +43,7 @@ run_snakemake() { fi ;; run) - snakemake --workflow-profile ./.config/snakemake/profiles/slurm + snakemake --workflow-profile ./.config/snakemake/profiles/slurm #--unlock ;; *) echo "Invalid option: $option" -- GitLab From 63f9b64e81c8214fa565012b704aa9b8fcae3408 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 13 Jan 2025 15:17:19 +0100 Subject: [PATCH 130/178] Addapt the script to run on single node clusters --- local_run.sh | 16 +++++++++++----- 1 file changed, 11 
insertions(+), 5 deletions(-) diff --git a/local_run.sh b/local_run.sh index 71ee7ee..acc9300 100755 --- a/local_run.sh +++ b/local_run.sh @@ -1,8 +1,14 @@ #!/bin/bash -# Script to run locally, DO NOT USE AS IS ON A CLUSTER! + +## TMP config to run on the CBIB +#SBATCH --job-name=asm4pg +#SBATCH --ntasks=20 +#SBATCH --mem=100G +#SBATCH -o slurm_logs/out_job_%j.out +#SBATCH -e slurm_logs/err_job_%j.err # Written by Lucien Piat at INRAe -# 07/01/24 +# 07/01/25 SNG_BIND=$(pwd) @@ -11,10 +17,10 @@ run_snakemake() { case "$option" in dry) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) -n -R all + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) -n ;; dag) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) -R all --dag > dag.dot + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) --dag > dag.dot if [ $? -eq 0 ]; then echo "Asm4pg -> DAG has been successfully generated as dag.dot" else @@ -23,7 +29,7 @@ run_snakemake() { fi ;; run) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) -R all #--forceall + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) #--unlock ;; *) echo "Invalid option: $option" -- GitLab From 4a6352bd3024e3c60fb15a803f6b4ab872f820dc Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 13 Jan 2025 15:21:53 +0100 Subject: [PATCH 131/178] Add more verbose to the script --- workflow/scripts/quast_call.sh | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) mode change 100644 => 100755 workflow/scripts/quast_call.sh diff --git a/workflow/scripts/quast_call.sh b/workflow/scripts/quast_call.sh old mode 100644 new mode 100755 index b63cae7..bda9c92 --- a/workflow/scripts/quast_call.sh +++ b/workflow/scripts/quast_call.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Script to dynamically run quast on produced genomes +# Script to dynamically run quast on produced genomes, with verbose output # Author: Lucien PIAT # For: Project Pangenoak # Date: January 6, 2025 @@ -17,22 +17,42 @@ RAGTAG_HAP2="$9" OUTPUT_DIR="${10}" # Create the list of genomes to run quast on +echo "Asm4pg -> Preparing genome list for QUAST analysis..." genomes=("$FINAL_HAP1" "$FINAL_HAP2") +echo " - Added final haplotypes: $FINAL_HAP1, $FINAL_HAP2" if [ "$PURGE_BOOL" == "True" ]; then genomes+=("$RAW_HAP1" "$RAW_HAP2") + echo " - Purge option enabled: added raw haplotypes: $RAW_HAP1, $RAW_HAP2" fi if [ "$RAGTAG_BOOL" == "True" ]; then genomes+=("$RAGTAG_HAP1" "$RAGTAG_HAP2") + echo " - RagTag option enabled: added RagTag haplotypes: $RAGTAG_HAP1, $RAGTAG_HAP2" fi # Build the quast command +echo "Asm4pg -> Building the QUAST command..." quast_cmd="quast " if [ "$REFERENCE_GENOME" != "None" ]; then + echo " - Reference genome specified: $REFERENCE_GENOME" quast_cmd+="--reference $REFERENCE_GENOME " fi +echo " - Genomes to process: ${genomes[@]}" quast_cmd+="${genomes[@]} --output-dir $OUTPUT_DIR" +# Verbose: Display the constructed command +echo "Asm4pg -> Constructed QUAST command:" +echo "$quast_cmd" + # Run the quast command +echo "Asm4pg -> Running QUAST..." eval $quast_cmd + +# Exit status check +if [ $? -eq 0 ]; then + echo "Asm4pg -> QUAST completed successfully." +else + echo "Asm4pg -> ERROR: QUAST encountered an issue. Check the output for details." 
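+    # A non-zero exit status here propagates to Snakemake, which marks the quast job as failed.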
+ exit 1 +fi -- GitLab From 5e3840c7e0fd5c752e4b0e4b6076d91b860e2ea1 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 13 Jan 2025 15:22:17 +0100 Subject: [PATCH 132/178] Add error handling and verbose --- workflow/scripts/ragtag_call.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) mode change 100644 => 100755 workflow/scripts/ragtag_call.sh diff --git a/workflow/scripts/ragtag_call.sh b/workflow/scripts/ragtag_call.sh old mode 100644 new mode 100755 index 9f56f1a..2ec8f1d --- a/workflow/scripts/ragtag_call.sh +++ b/workflow/scripts/ragtag_call.sh @@ -22,16 +22,20 @@ echo "HAP_OUT: $HAP_OUT" >> $RECAP if [[ "$RAGTAG" == "True" || "$RAGTAG" == "true" ]]; then echo "Asm4pg -> Running ragtag" - echo "Ragtag execution started" >> $RECAP + echo "Asm4pg -> Ragtag execution started" >> $RECAP mkdir -p $DIRR - ragtag.py scaffold -o $DIRR -t $THREADS $REF $HAP_IN - gzip $DIRR/ragtag.scaffold.fasta - mv $DIRR/ragtag.scaffold.fasta.gz $HAP_OUT - echo "Ragtag execution completed" >> $RECAP - echo "Output file: $HAP_OUT" >> $RECAP + if ragtag.py scaffold -o $DIRR -t $THREADS $REF $HAP_IN; then + gzip $DIRR/ragtag.scaffold.fasta + mv $DIRR/ragtag.scaffold.fasta.gz $HAP_OUT + echo "Asm4pg -> Ragtag execution completed" >> $RECAP + echo "Output file: $HAP_OUT" >> $RECAP + else + echo "Asm4pg -> Ragtag execution failed" >> $RECAP + exit 1 + fi else echo "Asm4pg -> Ragtag option is off" - echo "Ragtag option is off" >> $RECAP + echo "Asm4pg -> Ragtag option is off" >> $RECAP mkdir -p $DIRR fi -- GitLab From 15ef5a57f5bdbcb7f035a4ce8097a6956ee5fecd Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 13 Jan 2025 16:50:50 +0100 Subject: [PATCH 133/178] Switch to rulegraph --- local_run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/local_run.sh b/local_run.sh index acc9300..05d0783 100755 --- a/local_run.sh +++ b/local_run.sh @@ -3,7 +3,7 @@ ## TMP config to run on the CBIB #SBATCH --job-name=asm4pg #SBATCH --ntasks=20 -#SBATCH --mem=100G +#SBATCH --mem=200G #SBATCH -o slurm_logs/out_job_%j.out #SBATCH -e slurm_logs/err_job_%j.err @@ -20,7 +20,7 @@ run_snakemake() { snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) -n ;; dag) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) --dag > dag.dot + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) --rulegraph > dag.dot if [ $? 
-eq 0 ]; then echo "Asm4pg -> DAG has been successfully generated as dag.dot" else -- GitLab From d8a28560ae7d10b143c1e8973948cf98d6e3d433 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 13 Jan 2025 16:52:03 +0100 Subject: [PATCH 134/178] Fix bugs in hifiasm and Quast rules --- workflow/Snakefile | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index e9b927c..cf96253 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -28,21 +28,27 @@ rule all: expand( os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html"), sample=config["samples"].keys() + ), + expand( + os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast"), + sample=config["samples"].keys() ) - + # Genome assembly using hifiasm rule hifiasm: input: reads = lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] output: - hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap1.p_ctg.gfa"), - hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap2.p_ctg.gfa") + hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","raw_{sample}_hap1.gfa"), + hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","raw_{sample}_hap2.gfa") params: prefix = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}"), mode = get_mode, run_1 = lambda wildcards: get_run(wildcards, run=1), run_2 = lambda wildcards: get_run(wildcards, run=2), - purge_force = get_purge_force + purge_force = get_purge_force, + raw_out_hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap1.p_ctg.gfa"), + raw_out_hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap2.p_ctg.gfa") benchmark: os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hifiasm_benchmark.txt") threads: 20 @@ -53,14 +59,18 @@ rule hifiasm: f"{container_registry}/hifiasm:0.19.6" shell: """ - ./workflow/scripts/hifiasm_call.sh {params.mode} {params.purge_force} {threads} {input.reads} {params.run_1} {params.run_2} {params.prefix} + ./workflow/scripts/hifiasm_call.sh {params.mode} {params.purge_force} {threads} {input.reads} {params.run_1} {params.run_2} {params.prefix} && + echo "Aasm4pg -> Cleaning hifiasm output files" + mv {params.raw_out_hap1} {output.hap1} && + mv {params.raw_out_hap2} {output.hap2} && + rm {params.prefix}* """ # Convert the gfa files of hifiasm to fasta TO_FA_CMD = r"""/^S/{print ">"$2;print $3}""" rule pigz_gfa_to_fasta: input: - gfa = os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}.bp.hap{n}.p_ctg.gfa"), + gfa = os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "raw_{sample}_hap{n}.gfa"), reads = lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] output: fasta = os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap{n}.fasta.gz") @@ -156,7 +166,6 @@ use rule genometools_on_raw_data as genometools_on_assembly with: os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "{sample}_hap{n}_genometools_stats.txt") priority: 0 -# NOT TESTED # BUSCO stats on assembly (may not work on first run, rerun the WF) rule busco: input: @@ -288,8 +297,7 @@ rule merqury: stats = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.completeness.stats") params: prefix = "{sample}_merqury", - out_dir 
= os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury"), - hap = "{n}" + out_dir = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury") threads: 20 resources: mem_mb=60000, @@ -305,9 +313,9 @@ rule merqury: # Run Merqury cd {params.out_dir} && \ export MERQURY=/usr/local/share/merqury && \ - merqury.sh {input.km_database} {params.out_dir}/tmp_hap1.fasta.gz {params.out_dir}/tmp_hap2.fasta.gz {params.prefix} - rm {params.out_dir}/tmp_hap1.fasta.gz - rm {params.out_dir}/tmp_hap2.fasta.gz + merqury.sh {input.km_database} {params.out_dir}/tmp_hap1.fasta.gz {params.out_dir}/tmp_hap2.fasta.gz {params.prefix} && + rm {params.out_dir}/tmp_hap* && + rm {params.out_dir}/*reads-db* """ # Identifies LTR retrotransposons @@ -434,20 +442,21 @@ rule quast: raw_hap2=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap2.fasta.gz"), final_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "{sample}_final_hap1.fasta.gz"), final_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "{sample}_final_hap2.fasta.gz"), - scafold = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "ragtag_scafold", "recap.txt") + scafold1 = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "ragtag_scafold", "recap.txt"), + scafold2 = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", "recap.txt") output: quast_output=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast") params: ragtag_hap1=lambda wildcards: os.path.join(output_dir, f"{wildcards.sample}_results", "02_final_assembly", "hap1", "ragtag_scafold", f"{wildcards.sample}_scafold_hap1.fasta.gz"), ragtag_hap2=lambda wildcards: os.path.join(output_dir, f"{wildcards.sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", f"{wildcards.sample}_scafold_hap2.fasta.gz"), - reference_genome=lambda wildcards: get_reference(wildcards.sample), - purge_bool=lambda wildcards: get_purge_bool(wildcards.sample), - ragtag_bool=lambda wildcards: get_ragtag_bool(wildcards.sample) + reference_genome=get_reference, + purge_bool=get_purge_bool, + ragtag_bool=get_ragtag_bool container: f"{container_registry}/staphb/quast:5.2.0" shell: """ bash ./workflow/scripts/quast_call.sh {params.reference_genome} {params.purge_bool} {params.ragtag_bool} \ {input.raw_hap1} {input.raw_hap2} {input.final_hap1} {input.final_hap2} \ - {params.ragtag_hap1} {params.ragtag_hap2} output.quast_output} + {params.ragtag_hap1} {params.ragtag_hap2} {output.quast_output} """ \ No newline at end of file -- GitLab From 59618f4c532f50137f6ba14958f03b5e70bf8eb9 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 13 Jan 2025 16:52:38 +0100 Subject: [PATCH 135/178] Add yak and trio assembly option --- workflow/scripts/hifiasm_call.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/workflow/scripts/hifiasm_call.sh b/workflow/scripts/hifiasm_call.sh index 4a6a1d1..c09e2aa 100755 --- a/workflow/scripts/hifiasm_call.sh +++ b/workflow/scripts/hifiasm_call.sh @@ -33,9 +33,22 @@ case "$MODE" in hi-c) echo "Asm4pg -> Running hifiasm in hi-c mode..." hifiasm -l${PURGE_FORCE} -o ${PREFIX} -t ${THREADS} --h1 ${RUN_1} --h2 ${RUN_2} ${INPUT} + echo "Asm4pg -> Renaming hifiasm output files" mv ${PREFIX}.hic.hap1.p_ctg.gfa ${PREFIX}.bp.hap1.p_ctg.gfa mv ${PREFIX}.hic.hap2.p_ctg.gfa ${PREFIX}.bp.hap2.p_ctg.gfa ;; + trio) + echo "Asm4pg -> Hifiasm called in trio mode..." 
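+        # Trio binning, sketched: yak condenses each parent's reads into a k-mer
+        # database (-k31 = k-mer size, -b37 = a 2^37-bit Bloom filter that drops
+        # error k-mers); hifiasm then uses both databases to phase the child reads.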
+ echo "Asm4pg -> Generating yak file for parent 1 ($RUN_1)" + yak count -k31 -b37 -t16 -o ${PREFIX}/yak/parent1.yak ${RUN_1} + echo "Asm4pg -> Generating yak file for parent 1 ($RUN_2)" + yak count -k31 -b37 -t16 -o ${PREFIX}/yak/parent2.yak ${RUN_2} + echo "Asm4pg -> Running hifiasm in trio mode..." + hifiasm -o ${PREFIX} -t ${THREADS} -1 ${PREFIX}/yak/parent1.yak -2 ${PREFIX}/yak/parent2.yak ${INPUT} + echo "Asm4pg -> Renaming hifiasm output files" + mv ${PREFIX}.dip.hap1.p_ctg.gfa ${PREFIX}.bp.hap1.p_ctg.gfa + mv ${PREFIX}.dip.hap2.p_ctg.gfa ${PREFIX}.bp.hap2.p_ctg.gfa + ;; *) echo "Asm4pg -> Unknown hifiasm mode: $MODE" ;; -- GitLab From 85ec55a757d8aa0fa642a5cbe78b4dc94b66b775 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 13 Jan 2025 16:53:09 +0100 Subject: [PATCH 136/178] remove yak, longqc and fastqc --- workflow/unimplemented.smk | 71 +------------------------------------- 1 file changed, 1 insertion(+), 70 deletions(-) diff --git a/workflow/unimplemented.smk b/workflow/unimplemented.smk index 6dd5f07..f4e3a87 100644 --- a/workflow/unimplemented.smk +++ b/workflow/unimplemented.smk @@ -1,48 +1,3 @@ -### QC on .bam files with LongQC -rule multiqc: - output: - res_path + "/{runid}/multiqc/{id}_multiqc.html" - params: - indir = res_path + "/{runid}", - name = "{id}_multiqc", - out = res_path + "/{runid}/multiqc" - container: - "docker://ewels/multiqc" - shell: - "multiqc {params.indir} --filename {params.name} --outdir {params.out} --ignore \"*multiqc*\" -d -dd 1 -f" - -rule longqc: - input: - abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{Bid}.bam" - output: - directory(res_path + "/{Bid}/{run}/01_raw_data_QC/02_longQC") - benchmark: - res_path + "/{Bid}/{run}/benchmark/longqc.txt" - priority: 1 - threads: 8 - resources: - mem_mb=60000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/longqc1.2.0c" - shell: - "longQC sampleqc -x pb-hifi -o {output} {input}" - -### QC on .fastq.gz files with FastQC -rule fastqc: - input: - get_fastq - output: - multiext(res_path + "/{Fid}/{run}/01_raw_data_QC/01_fastQC/{Fid}_fastqc", ".html", ".zip") - params: - output_path=res_path + "/{Fid}/{run}//01_raw_data_QC/01_fastQC/" - benchmark: - res_path + "/{Fid}/{run}/benchmark/fastqc.txt" - priority: 1 - threads: 4 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/fastqc:0.12.1" - shell: - "fastqc -o {params.output_path} {input}" rule meryl_trio: input: @@ -62,29 +17,6 @@ rule meryl_trio: "meryl k=21 count {input.p1} output {output.p1} && " "meryl k=21 count {input.p2} output {output.p2}" -rule cp_trio: - input: - hap1 = rules.hap_gfa_to_fasta.output.hap1_fa, - hap2 = rules.hap_gfa_to_fasta.output.hap2_fa - output: - hap1 = temp(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_hap1.fasta.gz"), - hap2 = temp(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_hap2.fasta.gz") - params: - path=res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury" - shell: - "cp {input.hap1} {output.hap1} && " - "cp {input.hap2} {output.hap2}" - -rule unzip: - input: - rules.cp_trio.output.hap1, - rules.cp_trio.output.hap2 - output: - hap1 = temp(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_hap1.fasta"), - hap2 = temp(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_hap2.fasta") - shell: - "unpigz -k -p 1 {input}" - rule merqury_trio: input: p1 = 
rules.meryl_trio.output.p1, @@ -152,5 +84,4 @@ rule no_purge_report_trio: container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" script: - "../scripts/report_trio.Rmd" - + "../scripts/report_trio.Rmd" \ No newline at end of file -- GitLab From ed314a1344db76e677f0842ec297460a6f9689f8 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Mon, 13 Jan 2025 16:53:57 +0100 Subject: [PATCH 137/178] Update documentation for new workflow --- doc/Going-further.md | 6 ++-- doc/Known-errors.md | 9 ------ doc/Outputs.md | 48 -------------------------------- doc/Programs.md | 66 -------------------------------------------- doc/documentation.md | 23 ++++----------- doc/known_errors.md | 12 ++++++++ doc/outputs.md | 49 ++++++++++++++++++++++++++++++++ doc/software_list.md | 35 +++++++++++++++++++++++ 8 files changed, 103 insertions(+), 145 deletions(-) delete mode 100644 doc/Known-errors.md delete mode 100644 doc/Outputs.md delete mode 100644 doc/Programs.md create mode 100644 doc/known_errors.md create mode 100644 doc/outputs.md create mode 100644 doc/software_list.md diff --git a/doc/Going-further.md b/doc/Going-further.md index fb3f321..590e69f 100644 --- a/doc/Going-further.md +++ b/doc/Going-further.md @@ -1,8 +1,6 @@ # Going further - ## 01. In-depth options -### Job.sh options - +### Job.sh/local_run.sh options For a dry run ```bash sbatch job.sh dry @@ -13,7 +11,7 @@ sbatch job.sh dag ``` To run the workflow ```bash -sbatch job.sh +sbatch job.sh run ``` ## Workflow options diff --git a/doc/Known-errors.md b/doc/Known-errors.md deleted file mode 100644 index 5cc2cc6..0000000 --- a/doc/Known-errors.md +++ /dev/null @@ -1,9 +0,0 @@ -# Troubleshooting - -[TOC] - -## One of the BUSCO rules failed -The first time you run the workflow, the BUSCO lineage might be downloaded multiple times. This can create a conflict between the jobs using BUSCO and may interrupt some of them. In that case, you only need to rerun the workflow once everything is done. - -## Snakemake locked directory -When you try to rerun the workflow after cancelling a job, you may have to unlock the results directory. To do so, go in `.config/snakemake_profile/slurm` and uncomment line 14 of `config.yaml`. Run the workflow once to unlock the directory (it should only take a few seconds). Still in `config.yaml`, comment line 14. The workflow will be able to run and create outputs. diff --git a/doc/Outputs.md b/doc/Outputs.md deleted file mode 100644 index 6825893..0000000 --- a/doc/Outputs.md +++ /dev/null @@ -1,48 +0,0 @@ -# Workflow output - -[TOC] - -## Directories -There are three directories for the data produced by the workflow: -- An automatic report is generated in the `RUN` directory. -- `01_raw_data_QC` contains all quality control ran on the reads. FastQC and LongQC create HTML reports on fastq and bam files respectively, reads stats are given by Genometools, and predictions of genome size and heterozygosity are given by Genomescope (in directory `04_kmer`). -- `02_genome_assembly` contains 2 assemblies. The first one is in `01_raw_assembly`, it is the assembly obtained with hifiasm. The second one is in `02_after_purge_dups_assembly`, it is the hifiasm assembly after haplotigs removal by purge_dups. 
Both assemblies have a `01_assembly_QC` directory containing assembly statistics done by Genometools (in directory `assembly_stats`), BUSCO analyses (`busco`), k-mer profiles with KAT (`katplot`) and completedness and QV stats with Merqury (`merqury`) as well as assembled telomeres with FindTelomeres (`telomeres`). -- `benchmark` contains main programs runtime - -``` -workflow_results -├── 00_input_data -└── FILENAME - └── RUN - ├── 01_raw_data_QC - │ ├── 01_fastQC - │ ├── 02_longQC - │ ├── 03_genometools - | └── 04_kmer - | └── genomescope - └── 02_genome_assembly - ├── 01_raw_assembly - │ ├── 00_assembly - | └── 01_assembly_QC - | ├── assembly_stats - | ├── busco - | ├── katplot - | ├── merqury - | └── telomeres - └── 02_after_purge_dups_assembly (optional) - ├── 00_assembly - | ├── hap1 - | └── hap2 - └── 01_assembly_QC - ├── assembly_stats - ├── busco - ├── katplot - ├── merqury - └── telomeres -``` - -## Additional files -- Symbolic links to haplotype 1 and haplotype 2 assemblies after purge_dups -- HTML report with the main results from each program -- Runtime file with the total workflow runtime for the dataset -- Global QUAST report diff --git a/doc/Programs.md b/doc/Programs.md deleted file mode 100644 index 2b975a7..0000000 --- a/doc/Programs.md +++ /dev/null @@ -1,66 +0,0 @@ -# Workflow steps and program versions -All images here will be pulled automatically by Snakemake the first time you run the workflow. It may take some time. Images are only downloaded once and reused automatically by the workflow. -Images are stored on the project's container registry but come from various container libraries: - -## 1. Pre-assembly -- Conversion of PacBio bam to fasta & fastq - - [smrtlink](https://www.pacb.com/support/software-downloads/) 9.0.0 -- Fastq to fasta conversion - - [seqtk](https://github.com/lh3/seqtk) 1.3 -- Raw data quality control - - [fastqc](https://github.com/s-andrews/FastQC) 0.12.1 - - [lonqQC](https://github.com/yfukasawa/LongQC) 1.2.0c -- Metrics - - [genometools](https://github.com/genometools/genometools) 1.5.9 -- K-mer analysis - - [jellyfish](https://github.com/gmarcais/Jellyfish) 2.3.0 - - [genomescope](https://github.com/tbenavi1/genomescope2.0) 2.0 - -## 2. Assembly -- Assembly - - [hifiasm](https://github.com/chhylp123/hifiasm) 0.19.6 - - [YAK](https://github.com/lh3/yak) 0.1 -- Metrics - - [genometools](https://github.com/genometools/genometools) 1.5.9 -- Assembly quality control - - [BUSCO](https://gitlab.com/ezlab/busco) 5.7.1 - - [KAT](https://github.com/TGAC/KAT) 2.4.1 -- Error rate, QV & phasing - - [meryl](https://github.com/marbl/meryl) and [merqury](https://github.com/marbl/merqury) 1.3 -- Detect assembled telomeres - - [FindTelomeres](https://github.com/JanaSperschneider/FindTelomeres) - - **Biopython** 1.75 -- Haplotigs and overlaps purging - - [purge_dups](https://github.com/dfguan/purge_dups) 1.2.5 - - **matplotlib** 0.11.5 -- Repeted elements quantification - - [LTR_retriever](https://github.com/oushujun/LTR_retriever) 3.0.1 - - [LTR_Finder](https://github.com/xzhub/LTR_Finder) latest as of october 2024 - -## 3. Report -- **R markdown** 4.0.3 -- [QUAST](https://github.com/ablab/quast) 5.2.0 - -# Docker images -The programs are pulled automatically as images by Snakemake the first time you run the workflow. It may take some time. Images are only downloaded once and reused automatically by the workflow. 
-Images are stored on the project's container registry but come from various container libraries:
-
-- [smrtlink](https://hub.docker.com/r/bryce911/smrtlink/tags)
-- [seqtk](https://hub.docker.com/r/nanozoo/seqtk)
-- [fastqc](https://hub.docker.com/r/staphb/fastqc/tags)
-- [lonqQC](https://hub.docker.com/r/grpiccoli/longqc/tags)
-- [genometools](https://hub.docker.com/r/biocontainers/genometools/tags)
-- [jellyfish](https://quay.io/repository/biocontainers/kmer-jellyfish?tab=tags)
-- [genomescope](https://hub.docker.com/r/abner12/genomescope)
-- hifiasm, custom
-- yak, custom
-- [BUSCO](https://hub.docker.com/r/ezlabgva/busco/tags)
-- [KAT](https://quay.io/repository/biocontainers/kat)
-- [meryl and merqury](https://quay.io/repository/biocontainers/merqury?tab=tags)
-- [Biopython for FindTelomeres](https://quay.io/repository/biocontainers/biopython?tab=tags)
-- [purge_dups](https://hub.docker.com/r/wangnan9394/purge_dups/tags)
-- [matplotlib as companion to purge_dups](https://hub.docker.com/r/biocontainers/matplotlib-venn/tags)
-- [R markdown](https://hub.docker.com/r/reslp/rmarkdown/tags)
-- LTR_retriever
-- LTR_Finder custom
-- [QUAST](https://hub.docker.com/r/staphb/quast/tags)
\ No newline at end of file
diff --git a/doc/documentation.md b/doc/documentation.md
index c133341..72016f5 100644
--- a/doc/documentation.md
+++ b/doc/documentation.md
@@ -4,28 +4,15 @@ Asm4pg is an automatic and reproducible genome assembly workflow for pangenomic
 doc: [Gitlab pages](https://asm4pg.pages.mia.inra.fr/genomasm4pg)

-
-## Asm4pg Requirements
-- snakemake >= 6.5.1
-- singularity
-
-The workflow does not work with HPC that does not allow a job to run other jobs.
-
 ## Tutorials
-The three assembly modes from hifiasm are available.
-- [Quick start (default mode)](doc/Quick-start.md)
-- [Hi-C mode](doc/Assembly-Mode/Hi-C-tutorial.md)
-- [Trio mode](doc/Assembly-Mode/Trio-tutorial.md)
+### Running in Hi-C mode

-## Outputs
-[Workflow outputs](doc/Outputs.md)
+### All options

-## Optional Data Preparation
-If your [data is in a tarball](doc/Tar-data-preparation.md)
+## Outputs

 ## Known errors
-You may run into [these errors](doc/Known-errors.md)
+You may run into [these errors](doc/known_errors.md)

 ## Software
-[Softwares used in the workflow](doc/Programs.md)
+[Software used in the workflow](doc/software_list.md)
diff --git a/doc/known_errors.md b/doc/known_errors.md
new file mode 100644
index 0000000..6e37f94
--- /dev/null
+++ b/doc/known_errors.md
@@ -0,0 +1,12 @@
+# Troubleshooting
+
+## One of the BUSCO rules failed
+The first time you run the workflow, the BUSCO lineage might be downloaded multiple times. This can create a conflict between the jobs using BUSCO and may interrupt some of them. In that case, simply rerun the workflow once everything has finished.
+
+## Snakemake locked directory
+When you try to rerun the workflow after cancelling a job, you may have to unlock the results directory. To do so, open `job.sh` (or `local_run.sh`) and remove the `#` in front of `--unlock` at the end of the snakemake command. Run the workflow once to unlock the directory (it should only take a few seconds), then put the `#` back. The workflow will then run and create outputs as usual.
+
+## HPC problems
+The workflow does not work on HPCs that do not allow a job to submit other jobs. You can still run the workflow from an interactive session with `./local_run.sh`.
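+A minimal interactive fallback could look like this (a sketch only; the `salloc` resource values are illustrative and should be adapted to your cluster):
+```bash
+# Request an interactive allocation, then drive the workflow from inside it.
+salloc --ntasks=1 --cpus-per-task=8 --mem=32G --time=12:00:00
+./local_run.sh run
+```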
You can still run the workflow with a dynamic session and `./local_run.sh`.
+
+If the version of SLURM on the HPC is old, you may run into the error `srun: unrecognized option '--cpu-bind=q'`. This is a known SLURM/Snakemake issue and SLURM needs to be updated (https://github.com/snakemake/snakemake/issues/2071)
\ No newline at end of file
diff --git a/doc/outputs.md b/doc/outputs.md
new file mode 100644
index 0000000..669264a
--- /dev/null
+++ b/doc/outputs.md
@@ -0,0 +1,49 @@
+# Workflow output
+
+## Directories
+There are 4 directories for the data produced by the workflow (see the example and file tree below):
+- `01_raw_assembly`, which contains the direct output of Hifiasm.
+- `02_final_assembly`, which contains the assembled haplotypes that may have been purged of haplotigs and/or scaffolded.
+- `03_raw_data_qc`, which contains quality metrics for the reads.
+- `04_assembly_qc`, which contains quality metrics for the final assembly.
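+
+For example, the final haplotype assemblies of a hypothetical sample named `example1` can be listed directly from the default `results/` folder:
+```bash
+ls results/example1_results/02_final_assembly/hap*/example1_final_hap*.fasta.gz
+```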
+
+## Files
+```bash
+results/                                  # Results folder containing all runs
+└── {sample}_results
+    ├── 01_raw_assembly                   # Raw assembly folder with gfa and fasta files
+    │   ├── {sample}_hap1.gfa
+    │   ├── {sample}_hap2.gfa
+    │   ├── {sample}_hap1.fasta.gz
+    │   ├── {sample}_hap2.fasta.gz
+    │   └── {sample}_hifiasm_benchmark.txt
+    ├── 02_final_assembly                 # Final assembly directory with fasta files
+    │   ├── hap1
+    │   │   ├── cutoffs
+    │   │   ├── ragtag_scafold            # Directory that contains scaffolded haplotypes
+    │   │   │   └── recap.txt
+    │   │   └── {sample}_final_hap1.fasta.gz
+    │   └── hap2
+    │       └── ...
+    ├── 03_raw_data_qc                    # Directory that contains QC on the reads
+    │   ├── genomescope
+    │   │   └── ...
+    │   ├── jellyfish
+    │   │   └── ...
+    │   └── {sample}_genometools_stats.txt
+    └── 04_assembly_qc                    # Directory with QC for the assembled haplotypes (one per haplotype)
+        ├── hap1
+        │   ├── busco
+        │   │   └── busco_{sample}_hap1.txt
+        │   ├── katplot
+        │   │   └── ...
+        │   ├── LTR
+        │   │   └── ...
+        │   ├── {sample}_hap1_genometools_stats.txt
+        │   └── telomeres
+        │       └── ...
+        ├── merqury
+        │   └── ...
+        └── meryl
+            └── ...
+```
\ No newline at end of file
diff --git a/doc/software_list.md b/doc/software_list.md
new file mode 100644
index 0000000..1abbb5d
--- /dev/null
+++ b/doc/software_list.md
@@ -0,0 +1,35 @@
+# Workflow steps and program versions
+All images here will be pulled automatically by Snakemake the first time you run the workflow. It may take some time. Images are only downloaded once and reused automatically by the workflow.
+Images are stored on the project's container registry:
+
+## 01. Assembly
+- Assembly
+  - [hifiasm](https://github.com/chhylp123/hifiasm) 0.19.6
+  - [YAK](https://github.com/lh3/yak) 0.1
+- Haplotigs and overlaps purging
+  - [purge_dups](https://github.com/dfguan/purge_dups) 1.2.5
+  - **matplotlib** 0.11.5
+- Scaffolding
+  - [RagTag](https://github.com/malonge/RagTag)
+
+## 02. Quality Control
+- K-mer analysis
+  - [jellyfish](https://github.com/gmarcais/Jellyfish) 2.3.0
+  - [genomescope](https://github.com/tbenavi1/genomescope2.0) 2.0
+- Metrics
+  - [genometools](https://github.com/genometools/genometools) 1.5.9
+- Assembly quality control
+  - [BUSCO](https://gitlab.com/ezlab/busco) 5.7.1
+  - [KAT](https://github.com/TGAC/KAT) 2.4.1
+- Error rate, QV & phasing
+  - [meryl](https://github.com/marbl/meryl) and [merqury](https://github.com/marbl/merqury) 1.3
+- Detect assembled telomeres
+  - [FindTelomeres](https://github.com/JanaSperschneider/FindTelomeres)
+  - **Biopython** 1.75
+- Repeated elements quantification
+  - [LTR_retriever](https://github.com/oushujun/LTR_retriever) 3.0.1
+  - [LTR_Finder](https://github.com/xzhub/LTR_Finder) latest as of October 2024
+- Contig length exploration
+  - [QUAST](https://github.com/ablab/quast) 5.2.0
+- Report generation
+  - **R markdown** 4.0.3
\ No newline at end of file
-- GitLab


From 8a2c99135982439a443d856ec42d3d9aa8b76d46 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Mon, 13 Jan 2025 16:54:21 +0100
Subject: [PATCH 138/178] Start to implement automatic input conversion

---
 workflow/scripts/input_conversion.sh | 151 +++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 workflow/scripts/input_conversion.sh

diff --git a/workflow/scripts/input_conversion.sh b/workflow/scripts/input_conversion.sh
new file mode 100644
index 0000000..8016f14
--- /dev/null
+++ b/workflow/scripts/input_conversion.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# Script to convert BAM or FASTQ files to FASTA.gz and run quality control tools (LongQC or FastQC) with customizable output directories
+# Author: Lucien PIAT
+# Date: January 13, 2025
+
+# Arguments
+INPUT_FILE="$1"
+OUTPUT_DIR="$2"
+QC_OUTPUT_DIR="$3"
+
+# Check if the output directory is specified, otherwise default to the current directory
+if [ -z "$OUTPUT_DIR" ]; then
+    OUTPUT_DIR="."
+fi
+
+# Check if the QC output directory is specified, otherwise default to the current directory
+if [ -z "$QC_OUTPUT_DIR" ]; then
+    QC_OUTPUT_DIR="."
+fi
+
+# Create a recap file in the output directory
+RECAP_FILE="$OUTPUT_DIR/recap.txt"
+echo "Recap of transformations and QC steps" > "$RECAP_FILE"
+echo "=====================================" >> "$RECAP_FILE"
+echo "Input file: $INPUT_FILE" >> "$RECAP_FILE"
+echo "Output directory: $OUTPUT_DIR" >> "$RECAP_FILE"
+echo "QC output directory: $QC_OUTPUT_DIR" >> "$RECAP_FILE"
+echo "" >> "$RECAP_FILE"
+
+# Function to convert BAM to FASTA.gz
+convert_bam_to_fasta() {
+    BAM_FILE="$1"
+    OUTPUT_FILE="$2"
+
+    echo "Converting BAM to FASTA: $BAM_FILE -> $OUTPUT_FILE"
+    samtools fasta "$BAM_FILE" | gzip > "$OUTPUT_FILE"
+    if [ $? -eq 0 ]; then
+        echo "Conversion completed: $OUTPUT_FILE"
+        echo "Converted BAM to FASTA: $BAM_FILE -> $OUTPUT_FILE" >> "$RECAP_FILE"
+    else
+        echo "Error: BAM to FASTA conversion failed!"
+        exit 1
+    fi
+}
+
+# Function to convert FASTQ to FASTA.gz
+convert_fastq_to_fasta() {
+    FASTQ_FILE="$1"
+    OUTPUT_FILE="$2"
+
+    echo "Converting FASTQ to FASTA: $FASTQ_FILE -> $OUTPUT_FILE"
+    seqtk seq -a "$FASTQ_FILE" | gzip > "$OUTPUT_FILE"
+    if [ $? -eq 0 ]; then
+        echo "Conversion completed: $OUTPUT_FILE"
+        echo "Converted FASTQ to FASTA: $FASTQ_FILE -> $OUTPUT_FILE" >> "$RECAP_FILE"
+    else
+        echo "Error: FASTQ to FASTA conversion failed!"
+        exit 1
+    fi
+}
+
+# Function to zip a FASTA file
+zip_fasta() {
+    FASTA_FILE="$1"
+    OUTPUT_FILE="$2"
+
+    echo "Zipping FASTA: $FASTA_FILE -> $OUTPUT_FILE"
+    gzip -c "$FASTA_FILE" > "$OUTPUT_FILE"
+    if [ $? -eq 0 ]; then
+        echo "Zipping completed: $OUTPUT_FILE"
+        echo "Zipped FASTA: $FASTA_FILE -> $OUTPUT_FILE" >> "$RECAP_FILE"
+    else
+        echo "Error: FASTA zipping failed!"
+        exit 1
+    fi
+}
+
+# Function to run LongQC on a BAM file
+run_longqc() {
+    BAM_FILE="$1"
+    QC_OUTPUT="$2"
+
+    echo "Running LongQC on BAM file: $BAM_FILE"
+    longqc "$BAM_FILE" -o "$QC_OUTPUT"
+    if [ $? -eq 0 ]; then
+        echo "LongQC completed successfully. Results saved to $QC_OUTPUT"
+        echo "LongQC completed on BAM: $BAM_FILE -> $QC_OUTPUT" >> "$RECAP_FILE"
+    else
+        echo "Error: LongQC failed!"
+        exit 1
+    fi
+}
+
+# Function to run FastQC on a FASTQ file
+run_fastqc() {
+    FASTQ_FILE="$1"
+    QC_OUTPUT="$2"
+
+    echo "Running FastQC on FASTQ file: $FASTQ_FILE"
+    fastqc "$FASTQ_FILE" --outdir="$QC_OUTPUT"
+    if [ $? -eq 0 ]; then
+        echo "FastQC completed successfully. Results saved to $QC_OUTPUT"
+        echo "FastQC completed on FASTQ: $FASTQ_FILE -> $QC_OUTPUT" >> "$RECAP_FILE"
+    else
+        echo "Error: FastQC failed!"
+        exit 1
+    fi
+}
+
+# Ensure the input file exists
+if [ ! -f "$INPUT_FILE" ]; then
+    echo "Error: Input file does not exist!"
+    exit 1
+fi
+
+# Dispatch on the full file name rather than on "${INPUT_FILE##*.}": a plain
+# extension check reports "gz" for gzipped FASTQ, so the gzipped branch would
+# never be reached
+case "$INPUT_FILE" in
+    *.bam)
+        # If it's a BAM file, convert it to FASTA.gz and run LongQC
+        OUTPUT_FILE="$OUTPUT_DIR/$(basename "$INPUT_FILE" .bam).fasta.gz"
+        convert_bam_to_fasta "$INPUT_FILE" "$OUTPUT_FILE"
+        run_longqc "$INPUT_FILE" "$QC_OUTPUT_DIR"
+        ;;
+    *.fastq.gz)
+        # If the FASTQ file is gzipped, unzip before converting, then run FastQC
+        OUTPUT_FILE="$OUTPUT_DIR/$(basename "$INPUT_FILE" .fastq.gz).fasta.gz"
+        gunzip -c "$INPUT_FILE" | seqtk seq -a | gzip > "$OUTPUT_FILE"
+        run_fastqc "$INPUT_FILE" "$QC_OUTPUT_DIR"
+        ;;
+    *.fastq)
+        # If it's a plain FASTQ file, convert it to FASTA.gz and run FastQC
+        OUTPUT_FILE="$OUTPUT_DIR/$(basename "$INPUT_FILE" .fastq).fasta.gz"
+        convert_fastq_to_fasta "$INPUT_FILE" "$OUTPUT_FILE"
+        run_fastqc "$INPUT_FILE" "$QC_OUTPUT_DIR"
+        ;;
+    *.fasta)
+        # If it's already a FASTA file, just zip it
+        OUTPUT_FILE="$OUTPUT_DIR/$(basename "$INPUT_FILE" .fasta).fasta.gz"
+        zip_fasta "$INPUT_FILE" "$OUTPUT_FILE"
+        ;;
+    *)
+        echo "Error: Unsupported file type: $INPUT_FILE"
+        exit 1
+        ;;
+esac
+
+echo "Processing completed. 
Output saved to: $OUTPUT_DIR" +echo "Quality control results saved to: $QC_OUTPUT_DIR" +echo "Recap saved to: $RECAP_FILE" -- GitLab From 5dbb1ed899d3475dce050dd0d44252e8dcf01cf6 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 14 Jan 2025 11:54:07 +0100 Subject: [PATCH 139/178] ingore quast cache --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 11626a9..6ce5669 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,5 @@ busco_downloads results dag.dot .RepeatMaskerCache -slurm_logs \ No newline at end of file +slurm_logs +.quast \ No newline at end of file -- GitLab From f4042c56dbf8e448fdf4ffd5860096f9052477a6 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 14 Jan 2025 11:54:31 +0100 Subject: [PATCH 140/178] add new dag --- doc/dag.svg | 336 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 211 insertions(+), 125 deletions(-) diff --git a/doc/dag.svg b/doc/dag.svg index e176c29..24b683f 100644 --- a/doc/dag.svg +++ b/doc/dag.svg @@ -1,215 +1,301 @@ <?xml version="1.0" standalone="no"?> -<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="676pt" height="404pt" viewBox="0.00 0.00 676.44 404.00"> -<g id="graph0" class="graph" transform="translate(4,400) scale(1)" data-name="snakemake_dag"> +<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="778pt" height="548pt" viewBox="0.00 0.00 778.00 548.00"> +<g id="graph0" class="graph" transform="translate(4,544) scale(1)" data-name="snakemake_dag"> -<polygon fill="white" stroke="none" points="-4,4 -4,-400 672.44,-400 672.44,4 -4,4" style=""/> +<polygon fill="white" stroke="none" points="-4,4 -4,-544 774,-544 774,4 -4,4" style=""/> <!-- 0 --> <g id="node1" class="node" pointer-events="visible" data-name="0"> -<path fill="none" stroke="#d88d56" stroke-width="2" d="M350.51,-36C350.51,-36 320.51,-36 320.51,-36 314.51,-36 308.51,-30 308.51,-24 308.51,-24 308.51,-12 308.51,-12 308.51,-6 314.51,0 320.51,0 320.51,0 350.51,0 350.51,0 356.51,0 362.51,-6 362.51,-12 362.51,-12 362.51,-24 362.51,-24 362.51,-30 356.51,-36 350.51,-36" style=""/> -<text text-anchor="middle" x="335.51" y="-15" font-family="sans" font-size="10.00" style="">all</text> +<path fill="none" stroke="#d88d56" stroke-width="2" d="M253,-36C253,-36 223,-36 223,-36 217,-36 211,-30 211,-24 211,-24 211,-12 211,-12 211,-6 217,0 223,0 223,0 253,0 253,0 259,0 265,-6 265,-12 265,-12 265,-24 265,-24 265,-30 259,-36 253,-36" style=""/> +<text text-anchor="middle" x="238" y="-15" font-family="sans" font-size="10.00" style="">all</text> </g> <!-- 1 --> <g id="node2" class="node" pointer-events="visible" data-name="1"> -<path fill="none" stroke="#b6d856" stroke-width="2" d="M207.67,-252C207.67,-252 139.35,-252 139.35,-252 133.35,-252 127.35,-246 127.35,-240 127.35,-240 127.35,-228 127.35,-228 127.35,-222 133.35,-216 139.35,-216 139.35,-216 207.67,-216 207.67,-216 213.67,-216 219.67,-222 219.67,-228 219.67,-228 219.67,-240 219.67,-240 219.67,-246 213.67,-252 207.67,-252" style=""/> -<text text-anchor="middle" x="173.51" y="-237" font-family="sans" font-size="10.00" style="">haplotigs_handling</text> -<text text-anchor="middle" x="173.51" y="-225" font-family="sans" font-size="10.00" style="">n: 1</text> +<path fill="none" stroke="#d85656" stroke-width="2" d="M432.16,-396C432.16,-396 363.84,-396 363.84,-396 357.84,-396 351.84,-390 351.84,-384 351.84,-384 351.84,-372 351.84,-372 351.84,-366 
357.84,-360 363.84,-360 363.84,-360 432.16,-360 432.16,-360 438.16,-360 444.16,-366 444.16,-372 444.16,-372 444.16,-384 444.16,-384 444.16,-390 438.16,-396 432.16,-396" style=""/> +<text text-anchor="middle" x="398" y="-375" font-family="sans" font-size="10.00" style="">haplotigs_handling</text> </g> <!-- 1->0 --> <g id="edge1" class="edge" data-name="1->0"> -<path fill="none" stroke="grey" stroke-width="2" d="M131.29,-215.11C69.2,-186.36 -34.57,-127.44 12.51,-72 48.33,-29.83 216.55,-21.15 295.02,-19.41" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="294.73,-22.91 304.67,-19.23 294.6,-15.91 294.73,-22.91" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M444.92,-374.87C526.99,-368.35 688,-341.93 688,-235 688,-235 688,-235 688,-161 688,-75.57 389.35,-35.09 278.65,-23.02" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="279.13,-19.55 268.82,-21.98 278.39,-26.52 279.13,-19.55" style=""/> </g> <!-- 5 --> <g id="node6" class="node" pointer-events="visible" data-name="5"> -<path fill="none" stroke="#61d856" stroke-width="2" d="M81.05,-108C81.05,-108 33.97,-108 33.97,-108 27.97,-108 21.97,-102 21.97,-96 21.97,-96 21.97,-84 21.97,-84 21.97,-78 27.97,-72 33.97,-72 33.97,-72 81.05,-72 81.05,-72 87.05,-72 93.05,-78 93.05,-84 93.05,-84 93.05,-96 93.05,-96 93.05,-102 87.05,-108 81.05,-108" style=""/> -<text text-anchor="middle" x="57.51" y="-87" font-family="sans" font-size="10.00" style="">cutoffs_graph</text> +<path fill="none" stroke="#56d8a2" stroke-width="2" d="M248.87,-324C248.87,-324 193.13,-324 193.13,-324 187.13,-324 181.13,-318 181.13,-312 181.13,-312 181.13,-300 181.13,-300 181.13,-294 187.13,-288 193.13,-288 193.13,-288 248.87,-288 248.87,-288 254.87,-288 260.87,-294 260.87,-300 260.87,-300 260.87,-312 260.87,-312 260.87,-318 254.87,-324 248.87,-324" style=""/> +<text text-anchor="middle" x="221" y="-303" font-family="sans" font-size="10.00" style="">unpigz_to_fasta</text> </g> <!-- 1->5 --> -<g id="edge13" class="edge" data-name="1->5"> +<g id="edge8" class="edge" data-name="1->5"> -<path fill="none" stroke="grey" stroke-width="2" d="M158.82,-215.02C138.94,-190.69 103.31,-147.07 80.04,-118.58" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="83.03,-116.71 74,-111.18 77.61,-121.14 83.03,-116.71" style=""/> -</g> -<!-- 7 --> -<g id="node8" class="node" pointer-events="visible" data-name="7"> - -<path fill="none" stroke="#56c1d8" stroke-width="2" d="M223.83,-108C223.83,-108 123.19,-108 123.19,-108 117.19,-108 111.19,-102 111.19,-96 111.19,-96 111.19,-84 111.19,-84 111.19,-78 117.19,-72 123.19,-72 123.19,-72 223.83,-72 223.83,-72 229.83,-72 235.83,-78 235.83,-84 235.83,-84 235.83,-96 235.83,-96 235.83,-102 229.83,-108 223.83,-108" style=""/> -<text text-anchor="middle" x="173.51" y="-87" font-family="sans" font-size="10.00" style="">genometools_on_assembly</text> -</g> -<!-- 1->7 --> -<g id="edge15" class="edge" data-name="1->7"> - -<path fill="none" stroke="grey" stroke-width="2" d="M173.51,-215.02C173.51,-191.54 173.51,-150.11 173.51,-121.64" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="177.01,-121.87 173.51,-111.87 170.01,-121.87 177.01,-121.87" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M352.88,-359.15C328.76,-349.62 298.93,-337.82 273.75,-327.86" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="275.2,-324.67 264.62,-324.25 272.63,-331.18 275.2,-324.67" style=""/> </g> <!-- 10 --> <g id="node11" class="node" pointer-events="visible" 
data-name="10"> -<path fill="none" stroke="#ced856" stroke-width="2" d="M282.38,-180C282.38,-180 226.64,-180 226.64,-180 220.64,-180 214.64,-174 214.64,-168 214.64,-168 214.64,-156 214.64,-156 214.64,-150 220.64,-144 226.64,-144 226.64,-144 282.38,-144 282.38,-144 288.38,-144 294.38,-150 294.38,-156 294.38,-156 294.38,-168 294.38,-168 294.38,-174 288.38,-180 282.38,-180" style=""/> -<text text-anchor="middle" x="254.51" y="-159" font-family="sans" font-size="10.00" style="">unpigz_to_fasta</text> +<path fill="none" stroke="#d6d856" stroke-width="2" d="M548.32,-324C548.32,-324 447.68,-324 447.68,-324 441.68,-324 435.68,-318 435.68,-312 435.68,-312 435.68,-300 435.68,-300 435.68,-294 441.68,-288 447.68,-288 447.68,-288 548.32,-288 548.32,-288 554.32,-288 560.32,-294 560.32,-300 560.32,-300 560.32,-312 560.32,-312 560.32,-318 554.32,-324 548.32,-324" style=""/> +<text text-anchor="middle" x="498" y="-303" font-family="sans" font-size="10.00" style="">genometools_on_assembly</text> </g> <!-- 1->10 --> <g id="edge18" class="edge" data-name="1->10"> -<path fill="none" stroke="grey" stroke-width="2" d="M193.95,-215.34C203.09,-207.44 214.05,-197.96 224.12,-189.26" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="226.23,-192.07 231.5,-182.88 221.65,-186.77 226.23,-192.07" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M423.49,-359.15C435.25,-350.93 449.4,-341.02 462.2,-332.06" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="463.89,-335.15 470.07,-326.55 459.87,-329.42 463.89,-335.15" style=""/> +</g> +<!-- 12 --> +<g id="node13" class="node" pointer-events="visible" data-name="12"> + +<path fill="none" stroke="#56d8c1" stroke-width="2" d="M380,-180C380,-180 350,-180 350,-180 344,-180 338,-174 338,-168 338,-168 338,-156 338,-156 338,-150 344,-144 350,-144 350,-144 380,-144 380,-144 386,-144 392,-150 392,-156 392,-156 392,-168 392,-168 392,-174 386,-180 380,-180" style=""/> +<text text-anchor="middle" x="365" y="-159" font-family="sans" font-size="10.00" style="">kat</text> +</g> +<!-- 1->12 --> +<g id="edge20" class="edge" data-name="1->12"> + +<path fill="none" stroke="grey" stroke-width="2" d="M396.71,-359.01C394.38,-329.12 388.78,-267.43 379,-216 377.56,-208.43 375.63,-200.34 373.66,-192.82" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="377.11,-192.16 371.09,-183.44 370.36,-194.01 377.11,-192.16" style=""/> +</g> +<!-- 16 --> +<g id="node17" class="node" pointer-events="visible" data-name="16"> + +<path fill="none" stroke="#88d856" stroke-width="2" d="M758,-324C758,-324 728,-324 728,-324 722,-324 716,-318 716,-312 716,-312 716,-300 716,-300 716,-294 722,-288 728,-288 728,-288 758,-288 758,-288 764,-288 770,-294 770,-300 770,-300 770,-312 770,-312 770,-318 764,-324 758,-324" style=""/> +<text text-anchor="middle" x="743" y="-303" font-family="sans" font-size="10.00" style="">merqury</text> +</g> +<!-- 1->16 --> +<g id="edge26" class="edge" data-name="1->16"> + +<path fill="none" stroke="grey" stroke-width="2" d="M445.08,-372.06C505.46,-364.98 613,-349.87 702,-324 702.27,-323.92 702.54,-323.84 702.81,-323.76" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="703.89,-327.09 712.2,-320.53 701.61,-320.47 703.89,-327.09" style=""/> +</g> +<!-- 18 --> +<g id="node19" class="node" pointer-events="visible" data-name="18"> + +<path fill="none" stroke="#59d856" stroke-width="2" d="M42,-180C42,-180 12,-180 12,-180 6,-180 0,-174 0,-168 0,-168 0,-156 0,-156 0,-150 6,-144 12,-144 12,-144 42,-144 
42,-144 48,-144 54,-150 54,-156 54,-156 54,-168 54,-168 54,-174 48,-180 42,-180" style=""/> +<text text-anchor="middle" x="27" y="-159" font-family="sans" font-size="10.00" style="">quast</text> +</g> +<!-- 1->18 --> +<g id="edge28" class="edge" data-name="1->18"> + +<path fill="none" stroke="grey" stroke-width="2" d="M351.07,-369.1C257.54,-352.23 53.7,-309.51 17,-252 5.91,-234.62 8.99,-211.36 14.54,-192.98" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="17.77,-194.35 17.75,-183.75 11.16,-192.05 17.77,-194.35" style=""/> </g> <!-- 2 --> <g id="node3" class="node" pointer-events="visible" data-name="2"> -<path fill="none" stroke="#56d8d0" stroke-width="2" d="M305.15,-324C305.15,-324 241.87,-324 241.87,-324 235.87,-324 229.87,-318 229.87,-312 229.87,-312 229.87,-300 229.87,-300 229.87,-294 235.87,-288 241.87,-288 241.87,-288 305.15,-288 305.15,-288 311.15,-288 317.15,-294 317.15,-300 317.15,-300 317.15,-312 317.15,-312 317.15,-318 311.15,-324 305.15,-324" style=""/> -<text text-anchor="middle" x="273.51" y="-303" font-family="sans" font-size="10.00" style="">pigz_gfa_to_fasta</text> +<path fill="none" stroke="#d8bc56" stroke-width="2" d="M210.64,-468C210.64,-468 147.36,-468 147.36,-468 141.36,-468 135.36,-462 135.36,-456 135.36,-456 135.36,-444 135.36,-444 135.36,-438 141.36,-432 147.36,-432 147.36,-432 210.64,-432 210.64,-432 216.64,-432 222.64,-438 222.64,-444 222.64,-444 222.64,-456 222.64,-456 222.64,-462 216.64,-468 210.64,-468" style=""/> +<text text-anchor="middle" x="179" y="-447" font-family="sans" font-size="10.00" style="">pigz_gfa_to_fasta</text> </g> <!-- 2->1 --> -<g id="edge10" class="edge" data-name="2->1"> +<g id="edge5" class="edge" data-name="2->1"> -<path fill="none" stroke="grey" stroke-width="2" d="M248.02,-287.15C236.26,-278.93 222.11,-269.02 209.31,-260.06" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="211.63,-257.42 201.43,-254.55 207.62,-263.15 211.63,-257.42" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M223.58,-434.75C256.81,-424.13 302.52,-409.52 338.8,-397.92" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="339.87,-401.26 348.33,-394.88 337.74,-394.59 339.87,-401.26" style=""/> </g> -<!-- 4 --> -<g id="node5" class="node" pointer-events="visible" data-name="4"> +<!-- 2->18 --> +<g id="edge30" class="edge" data-name="2->18"> -<path fill="none" stroke="#b6d856" stroke-width="2" d="M426.67,-252C426.67,-252 358.35,-252 358.35,-252 352.35,-252 346.35,-246 346.35,-240 346.35,-240 346.35,-228 346.35,-228 346.35,-222 352.35,-216 358.35,-216 358.35,-216 426.67,-216 426.67,-216 432.67,-216 438.67,-222 438.67,-228 438.67,-228 438.67,-240 438.67,-240 438.67,-246 432.67,-252 426.67,-252" style=""/> -<text text-anchor="middle" x="392.51" y="-237" font-family="sans" font-size="10.00" style="">haplotigs_handling</text> -<text text-anchor="middle" x="392.51" y="-225" font-family="sans" font-size="10.00" style="">n: 2</text> -</g> -<!-- 2->4 --> -<g id="edge12" class="edge" data-name="2->4"> - -<path fill="none" stroke="grey" stroke-width="2" d="M303.84,-287.15C318.25,-278.68 335.69,-268.42 351.28,-259.25" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="352.77,-262.43 359.62,-254.35 349.22,-256.4 352.77,-262.43" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M153.61,-431.13C112.39,-400.28 32.84,-332.53 6,-252 -0.46,-232.61 4.89,-210.21 11.8,-192.74" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="14.92,-194.33 15.75,-183.76 
8.52,-191.51 14.92,-194.33" style=""/> </g> <!-- 3 --> <g id="node4" class="node" pointer-events="visible" data-name="3"> -<path fill="none" stroke="#d8c356" stroke-width="2" d="M295.44,-396C295.44,-396 251.57,-396 251.57,-396 245.57,-396 239.57,-390 239.57,-384 239.57,-384 239.57,-372 239.57,-372 239.57,-366 245.57,-360 251.57,-360 251.57,-360 295.44,-360 295.44,-360 301.44,-360 307.44,-366 307.44,-372 307.44,-372 307.44,-384 307.44,-384 307.44,-390 301.44,-396 295.44,-396" style=""/> -<text text-anchor="middle" x="273.51" y="-381" font-family="sans" font-size="10.00" style="">hifiasm</text> -<text text-anchor="middle" x="273.51" y="-369" font-family="sans" font-size="10.00" style="">sample: run1</text> +<path fill="none" stroke="#d8a456" stroke-width="2" d="M194,-540C194,-540 164,-540 164,-540 158,-540 152,-534 152,-528 152,-528 152,-516 152,-516 152,-510 158,-504 164,-504 164,-504 194,-504 194,-504 200,-504 206,-510 206,-516 206,-516 206,-528 206,-528 206,-534 200,-540 194,-540" style=""/> +<text text-anchor="middle" x="179" y="-519" font-family="sans" font-size="10.00" style="">hifiasm</text> </g> <!-- 3->2 --> -<g id="edge11" class="edge" data-name="3->2"> +<g id="edge6" class="edge" data-name="3->2"> -<path fill="none" stroke="grey" stroke-width="2" d="M273.51,-359.34C273.51,-352.75 273.51,-345.08 273.51,-337.67" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="277.01,-337.93 273.51,-327.93 270.01,-337.93 277.01,-337.93" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M179,-503.34C179,-496.75 179,-489.08 179,-481.67" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="182.5,-481.93 179,-471.93 175.5,-481.93 182.5,-481.93" style=""/> +</g> +<!-- 4 --> +<g id="node5" class="node" pointer-events="visible" data-name="4"> + +<path fill="none" stroke="#56d8d8" stroke-width="2" d="M71.66,-252C71.66,-252 38.34,-252 38.34,-252 32.34,-252 26.34,-246 26.34,-240 26.34,-240 26.34,-228 26.34,-228 26.34,-222 32.34,-216 38.34,-216 38.34,-216 71.66,-216 71.66,-216 77.66,-216 83.66,-222 83.66,-228 83.66,-228 83.66,-240 83.66,-240 83.66,-246 77.66,-252 71.66,-252" style=""/> +<text text-anchor="middle" x="55" y="-231" font-family="sans" font-size="10.00" style="">scafolding</text> </g> <!-- 4->0 --> <g id="edge2" class="edge" data-name="4->0"> -<path fill="none" stroke="grey" stroke-width="2" d="M372.55,-215.37C363.24,-205.88 353,-193.4 347.51,-180 330.01,-137.34 330.05,-83.03 332.31,-49.65" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="335.79,-49.99 333.12,-39.74 328.81,-49.42 335.79,-49.99" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M70.29,-215.12C102.18,-177.83 175.99,-91.51 214.17,-46.86" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="216.82,-49.15 220.66,-39.28 211.5,-44.6 216.82,-49.15" style=""/> </g> -<!-- 6 --> -<g id="node7" class="node" pointer-events="visible" data-name="6"> +<!-- 4->18 --> +<g id="edge29" class="edge" data-name="4->18"> -<path fill="none" stroke="#61d856" stroke-width="2" d="M416.05,-180C416.05,-180 368.97,-180 368.97,-180 362.97,-180 356.97,-174 356.97,-168 356.97,-168 356.97,-156 356.97,-156 356.97,-150 362.97,-144 368.97,-144 368.97,-144 416.05,-144 416.05,-144 422.05,-144 428.05,-150 428.05,-156 428.05,-156 428.05,-168 428.05,-168 428.05,-174 422.05,-180 416.05,-180" style=""/> -<text text-anchor="middle" x="392.51" y="-159" font-family="sans" font-size="10.00" style="">cutoffs_graph</text> +<path fill="none" stroke="grey" stroke-width="2" 
d="M47.93,-215.34C45.2,-208.51 42,-200.5 38.93,-192.83" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="42.25,-191.7 35.28,-183.71 35.75,-194.3 42.25,-191.7" style=""/> </g> -<!-- 4->6 --> -<g id="edge14" class="edge" data-name="4->6"> +<!-- 5->4 --> +<g id="edge7" class="edge" data-name="5->4"> -<path fill="none" stroke="grey" stroke-width="2" d="M392.51,-215.34C392.51,-208.75 392.51,-201.08 392.51,-193.67" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="396.01,-193.93 392.51,-183.93 389.01,-193.93 396.01,-193.93" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M180.24,-288.46C157.69,-279.24 128.92,-267.35 96.17,-253.31" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="97.77,-250.19 87.2,-249.46 95.01,-256.62 97.77,-250.19" style=""/> </g> -<!-- 8 --> -<g id="node9" class="node" pointer-events="visible" data-name="8"> +<!-- 11 --> +<g id="node12" class="node" pointer-events="visible" data-name="11"> -<path fill="none" stroke="#56c1d8" stroke-width="2" d="M558.83,-180C558.83,-180 458.19,-180 458.19,-180 452.19,-180 446.19,-174 446.19,-168 446.19,-168 446.19,-156 446.19,-156 446.19,-150 452.19,-144 458.19,-144 458.19,-144 558.83,-144 558.83,-144 564.83,-144 570.83,-150 570.83,-156 570.83,-156 570.83,-168 570.83,-168 570.83,-174 564.83,-180 558.83,-180" style=""/> -<text text-anchor="middle" x="508.51" y="-159" font-family="sans" font-size="10.00" style="">genometools_on_assembly</text> +<path fill="none" stroke="#d86e56" stroke-width="2" d="M264,-252C264,-252 234,-252 234,-252 228,-252 222,-246 222,-240 222,-240 222,-228 222,-228 222,-222 228,-216 234,-216 234,-216 264,-216 264,-216 270,-216 276,-222 276,-228 276,-228 276,-240 276,-240 276,-246 270,-252 264,-252" style=""/> +<text text-anchor="middle" x="249" y="-231" font-family="sans" font-size="10.00" style="">busco</text> </g> -<!-- 4->8 --> -<g id="edge16" class="edge" data-name="4->8"> +<!-- 5->11 --> +<g id="edge19" class="edge" data-name="5->11"> -<path fill="none" stroke="grey" stroke-width="2" d="M422.08,-215.15C435.99,-206.76 452.79,-196.62 467.87,-187.52" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="469.64,-190.54 476.4,-182.38 466.03,-184.55 469.64,-190.54" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M228.07,-287.34C230.8,-280.51 234,-272.5 237.07,-264.83" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="240.25,-266.3 240.72,-255.71 233.75,-263.7 240.25,-266.3" style=""/> </g> -<!-- 12 --> -<g id="node13" class="node" pointer-events="visible" data-name="12"> +<!-- 13 --> +<g id="node14" class="node" pointer-events="visible" data-name="13"> + +<path fill="none" stroke="#5673d8" stroke-width="2" d="M357.98,-252C357.98,-252 306.02,-252 306.02,-252 300.02,-252 294.02,-246 294.02,-240 294.02,-240 294.02,-228 294.02,-228 294.02,-222 300.02,-216 306.02,-216 306.02,-216 357.98,-216 357.98,-216 363.98,-216 369.98,-222 369.98,-228 369.98,-228 369.98,-240 369.98,-240 369.98,-246 363.98,-252 357.98,-252" style=""/> +<text text-anchor="middle" x="332" y="-231" font-family="sans" font-size="10.00" style="">find_telomeres</text> +</g> +<!-- 5->13 --> +<g id="edge22" class="edge" data-name="5->13"> + +<path fill="none" stroke="grey" stroke-width="2" d="M249.3,-287.15C262.61,-278.76 278.69,-268.62 293.12,-259.52" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="294.6,-262.72 301.19,-254.43 290.87,-256.8 294.6,-262.72" style=""/> +</g> +<!-- 14 --> +<g id="node15" 
class="node" pointer-events="visible" data-name="14"> + +<path fill="none" stroke="#56d88a" stroke-width="2" d="M218.75,-180C218.75,-180 169.25,-180 169.25,-180 163.25,-180 157.25,-174 157.25,-168 157.25,-168 157.25,-156 157.25,-156 157.25,-150 163.25,-144 169.25,-144 169.25,-144 218.75,-144 218.75,-144 224.75,-144 230.75,-150 230.75,-156 230.75,-156 230.75,-168 230.75,-168 230.75,-174 224.75,-180 218.75,-180" style=""/> +<text text-anchor="middle" x="194" y="-159" font-family="sans" font-size="10.00" style="">LTR_retriever</text> +</g> +<!-- 5->14 --> +<g id="edge23" class="edge" data-name="5->14"> -<path fill="none" stroke="#ced856" stroke-width="2" d="M656.38,-180C656.38,-180 600.64,-180 600.64,-180 594.64,-180 588.64,-174 588.64,-168 588.64,-168 588.64,-156 588.64,-156 588.64,-150 594.64,-144 600.64,-144 600.64,-144 656.38,-144 656.38,-144 662.38,-144 668.38,-150 668.38,-156 668.38,-156 668.38,-168 668.38,-168 668.38,-174 662.38,-180 656.38,-180" style=""/> -<text text-anchor="middle" x="628.51" y="-159" font-family="sans" font-size="10.00" style="">unpigz_to_fasta</text> +<path fill="none" stroke="grey" stroke-width="2" d="M217.58,-287.02C213.1,-263.44 205.17,-221.73 199.75,-193.25" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="203.26,-192.98 197.96,-183.81 196.39,-194.29 203.26,-192.98" style=""/> </g> -<!-- 4->12 --> -<g id="edge20" class="edge" data-name="4->12"> +<!-- 15 --> +<g id="node16" class="node" pointer-events="visible" data-name="15"> -<path fill="none" stroke="grey" stroke-width="2" d="M439.61,-220.46C474.74,-210.93 524.18,-197.14 575.33,-181.23" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="576.28,-184.61 584.77,-178.27 574.18,-177.93 576.28,-184.61" style=""/> +<path fill="none" stroke="#568ad8" stroke-width="2" d="M153.76,-252C153.76,-252 114.24,-252 114.24,-252 108.24,-252 102.24,-246 102.24,-240 102.24,-240 102.24,-228 102.24,-228 102.24,-222 108.24,-216 114.24,-216 114.24,-216 153.76,-216 153.76,-216 159.76,-216 165.76,-222 165.76,-228 165.76,-228 165.76,-240 165.76,-240 165.76,-246 159.76,-252 153.76,-252" style=""/> +<text text-anchor="middle" x="134" y="-231" font-family="sans" font-size="10.00" style="">LTR_finder</text> </g> -<!-- 5->0 --> -<g id="edge3" class="edge" data-name="5->0"> +<!-- 5->15 --> +<g id="edge25" class="edge" data-name="5->15"> -<path fill="none" stroke="grey" stroke-width="2" d="M93.71,-74.83C96.67,-73.83 99.63,-72.87 102.51,-72 168.99,-51.98 248.14,-35.49 294.97,-26.46" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="295.43,-29.94 304.6,-24.63 294.12,-23.06 295.43,-29.94" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M199.05,-287.34C189.13,-279.36 177.21,-269.77 166.3,-260.99" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="168.59,-258.34 158.61,-254.8 164.2,-263.8 168.59,-258.34" style=""/> +</g> +<!-- 6 --> +<g id="node7" class="node" pointer-events="visible" data-name="6"> + +<path fill="none" stroke="#56a2d8" stroke-width="2" d="M392.24,-108C392.24,-108 337.76,-108 337.76,-108 331.76,-108 325.76,-102 325.76,-96 325.76,-96 325.76,-84 325.76,-84 325.76,-78 331.76,-72 337.76,-72 337.76,-72 392.24,-72 392.24,-72 398.24,-72 404.24,-78 404.24,-84 404.24,-84 404.24,-96 404.24,-96 404.24,-102 398.24,-108 392.24,-108" style=""/> +<text text-anchor="middle" x="365" y="-87" font-family="sans" font-size="10.00" style="">generate_report</text> </g> <!-- 6->0 --> -<g id="edge4" class="edge" data-name="6->0"> +<g id="edge3" 
class="edge" data-name="6->0"> -<path fill="none" stroke="grey" stroke-width="2" d="M385.29,-143.02C375.78,-119.33 358.93,-77.35 347.5,-48.86" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="350.77,-47.62 343.8,-39.65 344.27,-50.23 350.77,-47.62" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M332.62,-71.15C315.62,-61.78 294.67,-50.23 276.8,-40.39" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="278.68,-37.43 268.23,-35.66 275.3,-43.56 278.68,-37.43" style=""/> </g> -<!-- 7->0 --> -<g id="edge5" class="edge" data-name="7->0"> +<!-- 7 --> +<g id="node8" class="node" pointer-events="visible" data-name="7"> -<path fill="none" stroke="grey" stroke-width="2" d="M214.81,-71.15C239.67,-60.41 271.16,-46.81 295.74,-36.18" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="296.97,-39.47 304.76,-32.29 294.19,-33.04 296.97,-39.47" style=""/> +<path fill="none" stroke="#a7d856" stroke-width="2" d="M468.43,-180C468.43,-180 421.57,-180 421.57,-180 415.57,-180 409.57,-174 409.57,-168 409.57,-168 409.57,-156 409.57,-156 409.57,-150 415.57,-144 421.57,-144 421.57,-144 468.43,-144 468.43,-144 474.43,-144 480.43,-150 480.43,-156 480.43,-156 480.43,-168 480.43,-168 480.43,-174 474.43,-180 468.43,-180" style=""/> +<text text-anchor="middle" x="445" y="-159" font-family="sans" font-size="10.00" style="">genomescope</text> </g> -<!-- 8->0 --> -<g id="edge6" class="edge" data-name="8->0"> +<!-- 7->6 --> +<g id="edge12" class="edge" data-name="7->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M479.96,-143.14C464.85,-133.31 446.23,-120.58 430.51,-108 406.16,-88.52 380.47,-64.19 361.95,-45.89" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="364.66,-43.65 355.11,-39.07 359.72,-48.61 364.66,-43.65" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M424.81,-143.34C415.79,-135.44 404.96,-125.96 395.01,-117.26" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="397.57,-114.85 387.74,-110.9 392.96,-120.11 397.57,-114.85" style=""/> +</g> +<!-- 8 --> +<g id="node9" class="node" pointer-events="visible" data-name="8"> + +<path fill="none" stroke="#56c1d8" stroke-width="2" d="M468,-252C468,-252 438,-252 438,-252 432,-252 426,-246 426,-240 426,-240 426,-228 426,-228 426,-222 432,-216 438,-216 438,-216 468,-216 468,-216 474,-216 480,-222 480,-228 480,-228 480,-240 480,-240 480,-246 474,-252 468,-252" style=""/> +<text text-anchor="middle" x="453" y="-231" font-family="sans" font-size="10.00" style="">jellyfish</text> +</g> +<!-- 8->7 --> +<g id="edge17" class="edge" data-name="8->7"> + +<path fill="none" stroke="grey" stroke-width="2" d="M450.98,-215.34C450.23,-208.75 449.35,-201.08 448.5,-193.67" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="452,-193.45 447.39,-183.91 445.05,-194.24 452,-193.45" style=""/> +</g> +<!-- 8->12 --> +<g id="edge21" class="edge" data-name="8->12"> + +<path fill="none" stroke="grey" stroke-width="2" d="M430.79,-215.34C420.76,-207.36 408.71,-197.77 397.67,-188.99" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="399.88,-186.27 389.87,-182.79 395.52,-191.75 399.88,-186.27" style=""/> </g> <!-- 9 --> <g id="node10" class="node" pointer-events="visible" data-name="9"> -<path fill="none" stroke="#56d8b9" stroke-width="2" d="M295.51,-108C295.51,-108 265.51,-108 265.51,-108 259.51,-108 253.51,-102 253.51,-96 253.51,-96 253.51,-84 253.51,-84 253.51,-78 259.51,-72 265.51,-72 265.51,-72 295.51,-72 295.51,-72 
301.51,-72 307.51,-78 307.51,-84 307.51,-84 307.51,-96 307.51,-96 307.51,-102 301.51,-108 295.51,-108" style=""/> -<text text-anchor="middle" x="280.51" y="-87" font-family="sans" font-size="10.00" style="">busco</text> +<path fill="none" stroke="#70d856" stroke-width="2" d="M647.75,-180C647.75,-180 548.25,-180 548.25,-180 542.25,-180 536.25,-174 536.25,-168 536.25,-168 536.25,-156 536.25,-156 536.25,-150 542.25,-144 548.25,-144 548.25,-144 647.75,-144 647.75,-144 653.75,-144 659.75,-150 659.75,-156 659.75,-156 659.75,-168 659.75,-168 659.75,-174 653.75,-180 647.75,-180" style=""/> +<text text-anchor="middle" x="598" y="-159" font-family="sans" font-size="10.00" style="">genometools_on_raw_data</text> </g> -<!-- 9->0 --> -<g id="edge7" class="edge" data-name="9->0"> +<!-- 9->6 --> +<g id="edge11" class="edge" data-name="9->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M294.39,-71.34C300.2,-63.93 307.11,-55.14 313.59,-46.9" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="316.15,-49.31 319.57,-39.28 310.64,-44.98 316.15,-49.31" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M538.3,-143.06C500.91,-131.83 453.19,-117.49 417.29,-106.71" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="418.6,-103.45 408.02,-103.92 416.59,-110.15 418.6,-103.45" style=""/> </g> -<!-- 10->9 --> -<g id="edge17" class="edge" data-name="10->9"> +<!-- 10->6 --> +<g id="edge13" class="edge" data-name="10->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M261.07,-143.34C263.57,-136.59 266.51,-128.69 269.32,-121.11" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="272.6,-122.33 272.8,-111.74 266.04,-119.9 272.6,-122.33" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M502.6,-287.1C509.7,-255.14 519.53,-187.56 489,-144 472.54,-120.52 443.46,-107.38 417.59,-100.05" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="418.71,-96.73 408.15,-97.65 416.98,-103.51 418.71,-96.73" style=""/> </g> -<!-- 11 --> -<g id="node12" class="node" pointer-events="visible" data-name="11"> +<!-- 11->6 --> +<g id="edge9" class="edge" data-name="11->6"> -<path fill="none" stroke="#56d8b9" stroke-width="2" d="M481.51,-108C481.51,-108 451.51,-108 451.51,-108 445.51,-108 439.51,-102 439.51,-96 439.51,-96 439.51,-84 439.51,-84 439.51,-78 445.51,-72 451.51,-72 451.51,-72 481.51,-72 481.51,-72 487.51,-72 493.51,-78 493.51,-84 493.51,-84 493.51,-96 493.51,-96 493.51,-102 487.51,-108 481.51,-108" style=""/> -<text text-anchor="middle" x="466.51" y="-87" font-family="sans" font-size="10.00" style="">busco</text> +<path fill="none" stroke="grey" stroke-width="2" d="M263.28,-215.22C277.76,-197.21 300.84,-168.59 321,-144 327.82,-135.68 335.29,-126.65 342.1,-118.45" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="344.64,-120.87 348.34,-110.95 339.25,-116.4 344.64,-120.87" style=""/> </g> -<!-- 11->0 --> -<g id="edge8" class="edge" data-name="11->0"> +<!-- 12->6 --> +<g id="edge16" class="edge" data-name="12->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M438.73,-74.15C420.06,-64.18 395.18,-50.89 374.53,-39.85" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="376.39,-36.88 365.92,-35.25 373.09,-43.05 376.39,-36.88" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M365,-143.34C365,-136.75 365,-129.08 365,-121.67" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="368.5,-121.93 365,-111.93 361.5,-121.93 368.5,-121.93" style=""/> 
</g> -<!-- 12->11 --> -<g id="edge19" class="edge" data-name="12->11"> +<!-- 13->6 --> +<g id="edge15" class="edge" data-name="13->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M587.74,-144.37C565.69,-135.26 537.74,-123.54 506.16,-109.55" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="507.69,-106.4 497.13,-105.53 504.84,-112.8 507.69,-106.4" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M328.04,-215.16C324.76,-196.86 321.68,-167.79 329,-144 331.68,-135.29 336.33,-126.72 341.42,-119.1" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="344.16,-121.29 347.23,-111.15 338.51,-117.16 344.16,-121.29" style=""/> </g> -<!-- 13 --> -<g id="node14" class="node" pointer-events="visible" data-name="13"> +<!-- 14->6 --> +<g id="edge14" class="edge" data-name="14->6"> + +<path fill="none" stroke="grey" stroke-width="2" d="M231.53,-145.64C255.38,-135.88 286.59,-123.1 312.83,-112.36" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="314.15,-115.6 322.08,-108.57 311.5,-109.12 314.15,-115.6" style=""/> +</g> +<!-- 15->14 --> +<g id="edge24" class="edge" data-name="15->14"> + +<path fill="none" stroke="grey" stroke-width="2" d="M149.14,-215.34C155.56,-207.85 163.19,-198.95 170.32,-190.63" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="172.84,-193.07 176.69,-183.2 167.52,-188.51 172.84,-193.07" style=""/> +</g> +<!-- 16->6 --> +<g id="edge10" class="edge" data-name="16->6"> + +<path fill="none" stroke="grey" stroke-width="2" d="M740.31,-287.13C734.21,-253.34 716.04,-180.23 669,-144 630.53,-114.37 494.13,-100.13 417.76,-94.34" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="418.39,-90.88 408.16,-93.64 417.88,-97.86 418.39,-90.88" style=""/> +</g> +<!-- 17 --> +<g id="node18" class="node" pointer-events="visible" data-name="17"> + +<path fill="none" stroke="#56d873" stroke-width="2" d="M758,-396C758,-396 728,-396 728,-396 722,-396 716,-390 716,-384 716,-384 716,-372 716,-372 716,-366 722,-360 728,-360 728,-360 758,-360 758,-360 764,-360 770,-366 770,-372 770,-372 770,-384 770,-384 770,-390 764,-396 758,-396" style=""/> +<text text-anchor="middle" x="743" y="-375" font-family="sans" font-size="10.00" style="">meryl</text> +</g> +<!-- 17->16 --> +<g id="edge27" class="edge" data-name="17->16"> -<path fill="none" stroke="#d8ac56" stroke-width="2" d="M623.26,-108C623.26,-108 523.76,-108 523.76,-108 517.76,-108 511.76,-102 511.76,-96 511.76,-96 511.76,-84 511.76,-84 511.76,-78 517.76,-72 523.76,-72 523.76,-72 623.26,-72 623.26,-72 629.26,-72 635.26,-78 635.26,-84 635.26,-84 635.26,-96 635.26,-96 635.26,-102 629.26,-108 623.26,-108" style=""/> -<text text-anchor="middle" x="573.51" y="-93" font-family="sans" font-size="10.00" style="">genometools_on_raw_data</text> -<text text-anchor="middle" x="573.51" y="-81" font-family="sans" font-size="10.00" style="">sample: run1</text> +<path fill="none" stroke="grey" stroke-width="2" d="M743,-359.34C743,-352.75 743,-345.08 743,-337.67" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="746.5,-337.93 743,-327.93 739.5,-337.93 746.5,-337.93" style=""/> </g> -<!-- 13->0 --> -<g id="edge9" class="edge" data-name="13->0"> +<!-- 18->0 --> +<g id="edge4" class="edge" data-name="18->0"> -<path fill="none" stroke="grey" stroke-width="2" d="M512.53,-71.06C469.53,-58.42 413.15,-41.84 375.7,-30.82" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="376.78,-27.49 366.2,-28.03 374.8,-34.21 376.78,-27.49" 
style=""/>
</g>
</g>
</svg>
\ No newline at end of file
-- GitLab


From 84848fe01f93161e4b55813855c7434117389955 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 14 Jan 2025 11:55:09 +0100
Subject: [PATCH 141/178] Clarify the requirements

---
 README.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 654091e..e899790 100644
--- a/README.md
+++ b/README.md
@@ -23,9 +23,7 @@ This workflow uses [Snakemake](https://snakemake.readthedocs.io/en/stable/) to q
 ```
 
 ## Requirement
-Miniforge, Singularity/Apptainer, Snakemake
-
-
+Miniforge (Snakemake), Singularity/Apptainer
 ## How to Use
 ### 1. Set up
 Clone the Git repository
@@ -37,6 +35,7 @@ git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git && cd GenomAsm4pg
 - Edit the `masterconfig` file in the `.config/` directory with your sample information.
 
 ### 3. Run the workflow
+
 #### <ins>A. On an HPC</ins>
 - Edit `job.sh` with your email and add the path to the needed modules (`Singularity/Apptainer`, `Miniforge`)
 - Provide the environment you created in `job.sh`, under `source activate wf_env`; you can create it like this:
-- GitLab


From d3984196e8c318250df1cde4c1506bffa6a63a2f Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 14 Jan 2025 11:55:42 +0100
Subject: [PATCH 142/178] Add options and outputs sections

---
 doc/documentation.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/doc/documentation.md b/doc/documentation.md
index 72016f5..4595f8a 100644
--- a/doc/documentation.md
+++ b/doc/documentation.md
@@ -4,12 +4,11 @@ Asm4pg is an automatic and reproducible genome assembly workflow for pangenomic
 
 doc: [Gitlab pages](https://asm4pg.pages.mia.inra.fr/genomasm4pg)
 
-## Tutorials
-### Running in Hi-C mode
-
-### All options
+## All options
+Asm4pg has many options with default values; if you wish to modify them, refer to the [Going-Further](doc/going_further.md) section.
 
 ## Outputs
+If you want to know more about the files output by the workflow, refer to the [Outputs](doc/outputs.md) section.
 
 ## Known errors
 You may run into [these errors](doc/known_errors.md)
 
 ## Softwares
 [Softwares used in the workflow](doc/software_list.md)
-- GitLab


From 47cb5672afad911049d5f42a456297644373f41b Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 14 Jan 2025 11:56:28 +0100
Subject: [PATCH 143/178] Rewrite the file for new workflow

---
 doc/Going-further.md |  64 ---------------------
 doc/going_further.md | 101 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 64 deletions(-)
 delete mode 100644 doc/Going-further.md
 create mode 100644 doc/going_further.md

diff --git a/doc/Going-further.md b/doc/Going-further.md
deleted file mode 100644
index 590e69f..0000000
--- a/doc/Going-further.md
+++ /dev/null
@@ -1,64 +0,0 @@
-# Going further
-## 01. In-depth options
-### Job.sh/local_run.sh options
-For a dry run
-```bash
-sbatch job.sh dry
-```
-If tou want to output the DAG of the workflow
-```bash
-sbatch job.sh dag
-```
-To run the workflow
-```bash
-sbatch job.sh run
-```
-
-## Workflow options
-Inside the ./.config/marsterconfig.yaml file you can add more options
-```yaml
-IDS: ["example1"]
-
-example1:
-  fasta: ./GenomAsm4pg/tutorial_data.fasta
-  run: example_run
-  ploidy: 2
-  busco_lineage: eudicots_odb10
-  assembly_purge_force: 3
-  run_purge_dups : True
-  mode: default
-```
-
-- `fasta` : Your reads
-- `run` : The run name
-- `ploidy` : The ploidy of the organims
-- `busco_lineage` : The busco lineage of your organisms listed [here](https://busco.ezlab.org/list_of_lineages.html)
-- `assembly_purge_force` : [1-3] the purge level of Hifiasm `-l` parametter, full description [here](https://hifiasm.readthedocs.io/en/latest/parameter-reference.html) default is set to 3
-- `run_purge_dups` : [True, False] If set to true, the workflow will run [purge_dups](https://github.com/dfguan/purge_dups) on the assembly and rerun all the metrics. Default is set to False. Note that truning on this option will more tan double the runing time of the workflow.
-- `mode`: [default, hi-c, trio] See [Hi-C assembly mode tutorial](Assembly-Mode/Hi-C-tutorial.md) or the [Trio assembly mode tutorial](Assembly-Mode/Trio-tutorial.md)
-
-
-## 2. Run the workflow on multiple datasets
-You can run the workflow on multiple datasets at the same time.
-
-```yaml
-IDS: ["toy_dataset", "purge_dataset", "toy_dataset_hi-c", "toy_dataset_trio"]
-
-toy_dataset:
-  ...
-
-toy_dataset_hi-c:
-  ...
-
-toy_dataset_trio:
-  ...
-```
-
-You can remove dataset from IDS to assemble only chosen genomes:
-```yaml
-IDS: ["toy_dataset", "toy_dataset_trio"]
-```
-Running the workflow with this config will assemble only `toy_dataset` and `toy_dataset_trio`.
-
-## 4. Add a reference genome
-You can add a reference genome to the `.masterconfig` and set `scafold_output` to True to run ragtag on your output.
\ No newline at end of file
diff --git a/doc/going_further.md b/doc/going_further.md
new file mode 100644
index 0000000..6a7705d
--- /dev/null
+++ b/doc/going_further.md
@@ -0,0 +1,101 @@
+# Going further
+
+## 01. Job.sh/local_run.sh options
+Usage: job.sh/local_run.sh [dry|run|dag|rulegraph|unlock] (see the example below)
+- [dry] - run the specified Snakefile in dry-run mode
+- [run] - run the specified Snakefile normally
+- [dag] - generate the directed acyclic graph for the specified Snakefile
+- [rulegraph] - generate the rulegraph for the specified Snakefile
+- [unlock] - unlock the directory if Snakemake crashed
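+
+For example, a typical session on an HPC (a sketch, assuming the SLURM setup described in the README):
+```bash
+sbatch job.sh dry   # preview the jobs Snakemake would run
+sbatch job.sh run   # launch the workflow
+```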
+
+## 02. Workflow options
+Inside the ./.config/masterconfig.yaml file you can add more options.
+
+Here are all the options and their default values:
+- `fasta_gz` : Your reads (mandatory)
+- `mode` : [default, hi-c, trio] The mode for hifiasm assembly (default: default)
+    - `r1` : if hi-c or trio mode, the run1/parent1 read file
+    - `r2` : if hi-c or trio mode, the run2/parent2 read file
+- `run_purge_dups` : [True, False] If set to true, the workflow will run [purge_dups](https://github.com/dfguan/purge_dups) on the assembly. (default: False)
+- `busco_lineage` : The busco lineage of your organism, listed [here](https://busco.ezlab.org/list_of_lineages.html) (default: eukaryota_odb10)
+- `ploidy` : The ploidy of the organism (default: 2)
+- `run_ragtag` : [True, False] If set to true, the workflow will run RagTag and produce a scaffold of the assemblies (default: False)
+    - `reference_genome` : The reference genome used for Quast and RagTag scaffolding
+
+/!\ Advanced options: use them only if you have read the docs of the tools; we strongly advise keeping the default values:
+- `assembly_purge_force` : [1-3] the purge level of Hifiasm's `-l` parameter, full description [here](https://hifiasm.readthedocs.io/en/latest/parameter-reference.html) (default: 3)
+- `kmer_size` : The size of the k-mers used for QC steps (default: 21)
+
+## 03. Example configurations
+### Minimal config
+```yaml
+samples:
+    example1:
+        fasta_gz: example.fasta.gz
+```
+
+### Simple config
+```yaml
+samples:
+    example1:
+        fasta_gz: example.fasta.gz
+        busco_lineage: eudicots_odb10
+        run_purge_dups: True
+        run_ragtag: True
+        reference_genome: ref.fasta.gz
+```
+
+### Hi-C config
+```yaml
+samples:
+    example1:
+        fasta_gz: example.fasta.gz
+        mode: hi-c
+        r1: run1.fasta.gz
+        r2: run2.fasta.gz
+```
+
+### Trio config
+```yaml
+samples:
+    example1:
+        fasta_gz: example.fasta.gz
+        mode: trio
+        r1: parent1.fasta.gz
+        r2: parent2.fasta.gz
+```
+
+### Advanced config
+```yaml
+samples:
+    example1:
+        fasta_gz: example.fasta.gz
+        mode: hi-c
+        r1: run1.fasta.gz
+        r2: run2.fasta.gz
+        run_purge_dups: True
+        assembly_purge_force: 2
+        ploidy: 2
+        kmer_size: 21
+        busco_lineage: eudicots_odb10
+        run_ragtag: True
+        reference_genome: ref.fasta.gz
+```
+
+## 04. Run the workflow on multiple datasets
+You can run the workflow on multiple datasets at the same time.
+
+```yaml
+samples:
+    dataset_1:
+        fasta_gz: example_1.fasta.gz
+        run_purge_dups: True
+    dataset_2:
+        fasta_gz: example_2.fasta.gz
+        run_purge_dups: False
+    dataset_n:
+        fasta_gz: example_n.fasta.gz
+```
-- GitLab


From 8aca5cca18d3bb28fc0ecb747104ef749d11b66c Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 14 Jan 2025 11:56:47 +0100
Subject: [PATCH 144/178] Adress hpc problems

---
 doc/known_errors.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/doc/known_errors.md b/doc/known_errors.md
index 6e37f94..856fe2b 100644
--- a/doc/known_errors.md
+++ b/doc/known_errors.md
@@ -7,6 +7,13 @@ The first time you run the workflow, the BUSCO lineage might be downloaded multi
 
 When you try to rerun the workflow after cancelling a job, you may have to unlock the results directory. To do so, go in `job.sh/local_run.sh` and uncomment `#--unlock`. Run the workflow once to unlock the directory (it should only take a few seconds). Still in `job.sh/local_run.sh`, re-add the `#`. The workflow will then be able to run and create outputs.
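+
+If you prefer not to edit the scripts, the directory can also be unlocked by hand (a minimal sketch, assuming Snakemake is available in your environment and is run from the repository root so it finds the workflow):
+```bash
+snakemake --unlock
+```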
 
 ## HPC problems
-The workflow does not work with HPCs that do not allow a job to run other jobs.
-If the version of SLURM on the HPC is old, you may run into the error `srun: unrecognized option '--cpu-bind=q'`. This is a known SLURM/Snakemake issue and SLURM needs to be updated (https://github.com/snakemake/snakemake/issues/2071)
\ No newline at end of file
+The `asm4pg.sh` script does not work with HPCs that do not allow a job to run other jobs.
+
+If the version of SLURM on the HPC is old, you may run into the error `srun: unrecognized option '--cpu-bind=q'`. This is a known SLURM/Snakemake issue and SLURM needs to be updated (https://github.com/snakemake/snakemake/issues/2071)
+
+A temporary solution is to run `./local_run.sh` with sbatch:
+```
+module load Singularity
+source activate wf_env
+sbatch ./local_run.sh dry
+```
\ No newline at end of file
-- GitLab


From 049e8b378e05ae411517221176d52c2b66137f28 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 14 Jan 2025 13:46:11 +0100
Subject: [PATCH 145/178] fix quast command

---
 workflow/scripts/quast_call.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/quast_call.sh b/workflow/scripts/quast_call.sh
index bda9c92..e7a7b87 100755
--- a/workflow/scripts/quast_call.sh
+++ b/workflow/scripts/quast_call.sh
@@ -33,7 +33,7 @@ fi
 
 # Build the quast command
 echo "Asm4pg -> Building the QUAST command..."
-quast_cmd="quast "
+quast_cmd="python /quast-5.2.0/metaquast.py --threads 20 --large "
 if [ "$REFERENCE_GENOME" != "None" ]; then
     echo " - Reference genome specified: $REFERENCE_GENOME"
     quast_cmd+="--reference $REFERENCE_GENOME "
@@ -49,6 +49,12 @@ echo "$quast_cmd"
 
 echo "Asm4pg -> Running QUAST..."
 eval $quast_cmd
 
+echo "Asm4pg -> Isolating QUAST output"
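+# metaquast nests these plots under combined_reference/basic_stats; copy them
+# to the top of $OUTPUT_DIR where they are easier to find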
+ exit 1 + fi + ;; + unlock) + snakemake --workflow-profile ./.config/snakemake/profiles/slurm --unlock + ;; run) - snakemake --workflow-profile ./.config/snakemake/profiles/slurm #--unlock + snakemake --workflow-profile ./.config/snakemake/profiles/slurm ;; *) echo "Invalid option: $option" - echo "Usage: $0 [dry|dag|run]" + echo "Usage: $0 [dry|run|dag|rulegraph|unlock]" exit 1 ;; esac - - # Check if the Snakemake command was successful - if [ $? -eq 0 ]; then - echo "Asm4pg -> Snakemake workflow completed successfully." - else - echo "Asm4pg -> Error: Snakemake workflow execution failed." - exit 1 - fi } # Execute the function with the provided option diff --git a/local_run.sh b/local_run.sh index 05d0783..f46794f 100755 --- a/local_run.sh +++ b/local_run.sh @@ -9,6 +9,7 @@ # Written by Lucien Piat at INRAe # 07/01/25 +# Use this script to run asm4pg localy or on a single HPC node SNG_BIND=$(pwd) @@ -20,7 +21,7 @@ run_snakemake() { snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) -n ;; dag) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) --rulegraph > dag.dot + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) --dag > dag.dot if [ $? -eq 0 ]; then echo "Asm4pg -> DAG has been successfully generated as dag.dot" else @@ -28,12 +29,24 @@ run_snakemake() { exit 1 fi ;; + rulegraph) + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) --rulegraph > rulegraph.dot + if [ $? -eq 0 ]; then + echo "Asm4pg -> Rulegraph has been successfully generated as rulegraph.dot" + else + echo "Asm4pg -> Error: Failed to generate Rulegraph." + exit 1 + fi + ;; + unlock) + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) --unlock + ;; run) snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) #--unlock ;; *) echo "Invalid option: $option" - echo "Usage: $0 [dry|dag|run]" + echo "Usage: $0 [dry|run|dag|rulegraph|unlock]" exit 1 ;; esac @@ -48,11 +61,15 @@ run_snakemake() { } # Verify arguments -if [ $# -ne 1 ]; then - echo "Usage: $0 [dry|dag|run]" +if [ $# -ne 1 ] || [ "$1" == "help" ]; then + echo "Use this script to run asm4pg localy or on a single HPC node" + echo "" + echo "Usage: $0 [dry|run|dag|rulegraph|unlock]" echo " dry - run the specified Snakefile in dry-run mode" - echo " dag - generate DAG for the specified Snakefile" echo " run - run the specified Snakefile normally" + echo " dag - generate the directed acyclic graph for the specified Snakefile" + echo " rulegraph - generate the rulegraph for the specified Snakefile" + echo " unlock - Unlock the directory if snakemake crashed" exit 1 fi -- GitLab From ccf58cc270cbf058625839e25380abde4d632af9 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 14 Jan 2025 13:47:09 +0100 Subject: [PATCH 147/178] fixed quast command --- workflow/Snakefile | 126 +++++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index cf96253..f656192 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -16,7 +16,7 @@ assembly_qc_folder = os.path.join(output_dir, "{sample}_results", "04_assembly_q rule all: input: - # Required final assemblies and graphs + # Required final assemblies and report expand( os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}", "{sample}_final_hap{n}.fasta.gz"), sample=config["samples"].keys(), n=[1, 2] @@ -85,7 +85,6 @@ rule pigz_gfa_to_fasta: awk {TO_FA_CMD:q} 
{input.gfa} | pigz -p {threads} > {output.fasta} """ - # Potentialy purge the haplotigs using purge_dups rule haplotigs_handling: input: @@ -166,7 +165,7 @@ use rule genometools_on_raw_data as genometools_on_assembly with: os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "{sample}_hap{n}_genometools_stats.txt") priority: 0 -# BUSCO stats on assembly (may not work on first run, rerun the WF) +# BUSCO stats on assembly rule busco: input: rules.unpigz_to_fasta.output @@ -210,7 +209,7 @@ rule jellyfish: input: reads = lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] output: - jf = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "jellyfish", "{sample}.jf"), + jf = temp(os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "jellyfish", "{sample}.jf")), histo = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "jellyfish", "{sample}.histo") params: km_size = get_kmer_size @@ -274,7 +273,7 @@ rule meryl: input: lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] output: - directory(os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "meryl", "{sample}_reads-db.meryl")) + temp(directory(os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "meryl", "{sample}_reads-db.meryl"))) params: km_size = get_kmer_size threads: 20 @@ -311,11 +310,11 @@ rule merqury: cp {input.hap2} {params.out_dir}/tmp_hap2.fasta.gz # Run Merqury - cd {params.out_dir} && \ - export MERQURY=/usr/local/share/merqury && \ - merqury.sh {input.km_database} {params.out_dir}/tmp_hap1.fasta.gz {params.out_dir}/tmp_hap2.fasta.gz {params.prefix} && - rm {params.out_dir}/tmp_hap* && - rm {params.out_dir}/*reads-db* + cd {params.out_dir} + export MERQURY=/usr/local/share/merqury + merqury.sh {input.km_database} {params.out_dir}/tmp_hap1.fasta.gz {params.out_dir}/tmp_hap2.fasta.gz {params.prefix} + rm tmp_hap1.fasta.gz + rm tmp_hap2.fasta.gz """ # Identifies LTR retrotransposons @@ -332,7 +331,6 @@ rule LTR_finder: shell: "ltr_finder -C {input} > {output}" -# NOT TESTED # Calculates the LTR Assembly Index (LAI), a metric for assembly quality based on LTR retrotransposons rule LTR_retriever: input: @@ -363,54 +361,6 @@ rule LTR_retriever: rm tmp* ''' -# NOT TESTED -# Rule to generate the html report -rule generate_report: - input: - genomescope = rules.genomescope.output.plot, - genomescope_sum = rules.genomescope.output.summary, - genometools_on_raw_data = rules.genometools_on_raw_data.output, - - genometools_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "{sample}_hap1_genometools_stats.txt"), - genometools_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "{sample}_hap2_genometools_stats.txt"), - - busco_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "busco_{sample}_hap1.txt"), - busco_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "busco_{sample}_hap2.txt"), - - kplot_hap1 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "katplot", "{sample}_hap1.katplot.png"), - kplot_hap2 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "katplot", "{sample}_hap2.katplot.png"), - - telomeres_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "telomeres", "{sample}_hap1_telomeres.txt"), - telomeres_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "telomeres", "{sample}_hap2_telomeres.txt"), - - LAI_hap1 = 
os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "{sample}_hap1.out.LAI"), - LAI_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "{sample}_hap2.out.LAI"), - - LRT_recap_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "recap_{sample}_hap1.tbl"), - LRT_recap_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "recap_{sample}_hap2.tbl"), - - merqury_stats=rules.merqury.output.stats, - merqury_qv=rules.merqury.output.qv - output: - os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html") - params: - sample = "{sample}", - mode = get_mode, - assembly_purge_force = get_purge_force, - run_purge_dups = get_purge_bool, - busco_lineage = get_busco_lin, - ploidy = get_ploidy, - kmer_size = get_kmer_size, - r1 = lambda wildcards: get_run(wildcards, run=1), - r2 = lambda wildcards: get_run(wildcards, run=2) - resources: - mem_mb=10000, - time="10:00:00" - container: - f"{container_registry}/rmarkdown4.0.3" - script: - "../scripts/report.Rmd" - # NOT TESTED # Rule to create scafold on assemblies rule scafolding: @@ -454,9 +404,61 @@ rule quast: ragtag_bool=get_ragtag_bool container: f"{container_registry}/staphb/quast:5.2.0" + threads: 20 + resources: + mem_mb=100000, + time="10:00:00" shell: """ - bash ./workflow/scripts/quast_call.sh {params.reference_genome} {params.purge_bool} {params.ragtag_bool} \ + ./workflow/scripts/quast_call.sh {params.reference_genome} {params.purge_bool} {params.ragtag_bool} \ {input.raw_hap1} {input.raw_hap2} {input.final_hap1} {input.final_hap2} \ - {params.ragtag_hap1} {params.ragtag_hap2} {output.quast_output} - """ \ No newline at end of file + {params.ragtag_hap1} {params.ragtag_hap2} {output.quast_output} + """ + +# NOT TESTED +# Rule to generate the html report +rule generate_report: + input: + genomescope = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png"), + genomescope_sum = rules.genomescope.output.summary, + genometools_on_raw_data = rules.genometools_on_raw_data.output, + + genometools_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "{sample}_hap1_genometools_stats.txt"), + genometools_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "{sample}_hap2_genometools_stats.txt"), + + busco_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "busco_{sample}_hap1.txt"), + busco_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "busco_{sample}_hap2.txt"), + + kplot_hap1 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "katplot", "{sample}_hap1.katplot.png"), + kplot_hap2 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "katplot", "{sample}_hap2.katplot.png"), + + telomeres_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "telomeres", "{sample}_hap1_telomeres.txt"), + telomeres_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "telomeres", "{sample}_hap2_telomeres.txt"), + + LAI_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "{sample}_hap1.out.LAI"), + LAI_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "{sample}_hap2.out.LAI"), + + LRT_recap_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "recap_{sample}_hap1.tbl"), + 
LRT_recap_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "recap_{sample}_hap2.tbl"), + + merqury_stats=rules.merqury.output.stats, + merqury_qv=rules.merqury.output.qv + output: + os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html") + params: + sample = "{sample}", + mode = get_mode, + assembly_purge_force = get_purge_force, + run_purge_dups = get_purge_bool, + busco_lineage = get_busco_lin, + ploidy = get_ploidy, + kmer_size = get_kmer_size, + r1 = lambda wildcards: get_run(wildcards, run=1), + r2 = lambda wildcards: get_run(wildcards, run=2) + resources: + mem_mb=10000, + time="10:00:00" + container: + f"{container_registry}/rmarkdown4.0.3" + script: + "./workflow/scripts/report.Rmd" \ No newline at end of file -- GitLab From caf56c17518b976fe160caf1d318b9d6dccbeab8 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 14 Jan 2025 16:51:38 +0100 Subject: [PATCH 148/178] Update documentation with uniform style --- doc/Assembly-Mode/Hi-C-tutorial.md | 46 --- doc/Assembly-Mode/Trio-tutorial.md | 47 --- doc/documentation.md | 22 +- doc/fig/rule_dag.svg | 493 ----------------------------- doc/going_further.md | 91 +++--- doc/known_errors.md | 23 +- doc/outputs.md | 96 +++--- doc/software_list.md | 73 +++-- 8 files changed, 170 insertions(+), 721 deletions(-) delete mode 100644 doc/Assembly-Mode/Hi-C-tutorial.md delete mode 100644 doc/Assembly-Mode/Trio-tutorial.md delete mode 100644 doc/fig/rule_dag.svg diff --git a/doc/Assembly-Mode/Hi-C-tutorial.md b/doc/Assembly-Mode/Hi-C-tutorial.md deleted file mode 100644 index 95ac825..0000000 --- a/doc/Assembly-Mode/Hi-C-tutorial.md +++ /dev/null @@ -1,46 +0,0 @@ -# Hi-C mode tutorial - -Please look at [quick start](../Quick-start.md) first, some of the steps are omitted here. - -This tutorial shows how to use the workflow with hi-c assembly mode which takes PacBio Hifi data and Hi-C data as input. - -## 1. Config file -**TO-DO : add a toy dataset fasta and hi-c.** -```bash -cd GenomAsm4pg/.config -``` - -Modify `masterconfig.yaml`. The PacBio HiFi file is `toy_dataset_hi-c.fasta`, its name is used as key in config. The Hi-C files are `data_r1.fasta` and `data_r2.fasta` - -```yaml -####################### job - workflow ####################### -### CONFIG - -IDS: ["toy_dataset_hi-c"] - -toy_dataset_hi-c: - fasta: ./GenomAsm4pg/tutorial_data/hi-c/toy_dataset_hi-c.fasta - run: hi-c_tutorial - ploidy: 2 - busco_lineage: eudicots_odb10 - mode: hi-c - r1: ./GenomAsm4pg/tutorial_data/hi-c/data_r1.fasta - r2: ./GenomAsm4pg/tutorial_data/hi-c/data_r1.fasta -``` - -## 2. Dry run -To check the config, first do a dry run of the workflow. - -```bash -sbatch job.sh dry -``` -## 3. Run -If the dry run is successful, you can run the workflow. - -```bash -sbatch job.sh -``` - -## Other assembly modes -If you want to use parental data, follow the [Trio assembly mode tutorial](Trio-tutorial.md). -To go further with the workflow use go [here](../Going-further.md). diff --git a/doc/Assembly-Mode/Trio-tutorial.md b/doc/Assembly-Mode/Trio-tutorial.md deleted file mode 100644 index 027f4bf..0000000 --- a/doc/Assembly-Mode/Trio-tutorial.md +++ /dev/null @@ -1,47 +0,0 @@ -# Trio mode tutorial - -Please look at [quick start](../Quick-start.md) first, some of the steps are omitted here. - -This tutorial shows how to use the workflow with hi-c assembly mode which takes PacBio Hifi data and Hi-C data as input. - -## 1. 
Config file -**TO-DO : add a toy dataset fasta and parental fasta.** -```bash -cd GenomAsm4pg/.config -``` - -Modify `masterconfig.yaml`. The PacBio HiFi file is `toy_dataset_trio.fasta`, its name is used as key in config. The parental reads files are `data_p1.fasta` and `data_p2.fasta`. -Parental data is used as k-mers, you use Illumina or PacBio Hifi reads. - -```yaml -####################### job - workflow ####################### -### CONFIG - -IDS: ["toy_dataset_trio"] - -toy_dataset_trio: - fasta: ./GenomAsm4pg/tutorial_data/trio/toy_dataset_trio.fasta - run: trio_tutorial - ploidy: 2 - busco_lineage: eudicots_odb10 - mode: trio - p1: ./GenomAsm4pg/tutorial_data/trio/data_p1.fasta - p2: ./GenomAsm4pg/tutorial_data/trio/data_p2.fasta -``` - -## 2. Dry run -To check the config, first do a dry run of the workflow. - -```bash -sbatch job.sh dry -``` -## 3. Run -If the dry run is successful, you can run the workflow. - -```bash -sbatch job.sh -``` - -## Other assembly modes -If you want to use Hi-C data, follow the [Hi-C assembly mode tutorial](Hi-C-tutorial.md). -To go further with the workflow use go [here](../Going-further.md). diff --git a/doc/documentation.md b/doc/documentation.md index 4595f8a..d56311f 100644 --- a/doc/documentation.md +++ b/doc/documentation.md @@ -1,17 +1,21 @@ -# <A HREF="https://forgemia.inra.fr/asm4pg/GenomAsm4pg"> asm4pg </A> +# [asm4pg](https://forgemia.inra.fr/asm4pg/GenomAsm4pg) Asm4pg is an automatic and reproducible genome assembly workflow for pangenomic applications using PacBio HiFi data. -doc: [Gitlab pages](https://asm4pg.pages.mia.inra.fr/genomasm4pg) +Documentation: [GitLab Pages](https://asm4pg.pages.mia.inra.fr/genomasm4pg) -## All options -Asm4pg has many options with default values, if you wish to modify them, refer to the [Going-Further](doc/going_further.md) section. +## All Options + +Asm4pg has many options with default values. To modify them, refer to the [Going Further](doc/going_further.md) section. ## Outputs -If you want to now more about the outputed files of the workflow, refer to the [Outputs](doc/outputs.md) section. -## Known errors -You may run into [these errors](doc/known_errors.md) +To learn more about the workflow's output files, refer to the [Outputs](doc/outputs.md) section. + +## Known Errors + +You may encounter [these errors](doc/known_errors.md). 
+
+## Software
-## Softwares
-[Softwares used in the workflow](doc/software_list.md)
+Here is a list of [software used in the workflow](doc/software_list.md)
\ No newline at end of file
diff --git a/doc/fig/rule_dag.svg b/doc/fig/rule_dag.svg
deleted file mode 100644
index b225cdd..0000000
--- a/doc/fig/rule_dag.svg
+++ /dev/null
@@ -1,493 +0,0 @@
[493 lines of Graphviz-generated SVG markup for the deleted "snakemake_dag" rule graph figure omitted]
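(The deleted figure can be regenerated at any time: the `rulegraph` option added to `job.sh`/`local_run.sh` earlier in this series writes a Graphviz DOT file, and rendering it back to an SVG only needs the standard `dot` tool. A sketch, assuming Graphviz is installed:)

```bash
./local_run.sh rulegraph
dot -Tsvg rulegraph.dot -o doc/fig/rule_dag.svg
```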
font-family="sans" font-size="10.00" fill="#000000">genomescope</text> -</g> -<!-- 2->0 --> -<g id="edge13" class="edge"> -<title>2->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M715.2716,-215.9496C720.8057,-205.6614 727.418,-192.3864 732,-180 748.7262,-134.7848 759.3817,-79.5701 764.7649,-46.6773"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="768.2812,-46.8494 766.3895,-36.4248 761.3675,-45.7538 768.2812,-46.8494"/> -</g> -<!-- 2->20 --> -<g id="edge44" class="edge"> -<title>2->20</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M702.7289,-215.8314C701.7664,-208.131 700.6218,-198.9743 699.5521,-190.4166"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="703.0151,-189.9019 698.3017,-180.4133 696.0691,-190.7702 703.0151,-189.9019"/> -</g> -<!-- 3 --> -<g id="node4" class="node"> -<title>3</title> -<path fill="none" stroke="#5673d8" stroke-width="2" d="M720.5,-324C720.5,-324 689.5,-324 689.5,-324 683.5,-324 677.5,-318 677.5,-312 677.5,-312 677.5,-300 677.5,-300 677.5,-294 683.5,-288 689.5,-288 689.5,-288 720.5,-288 720.5,-288 726.5,-288 732.5,-294 732.5,-300 732.5,-300 732.5,-312 732.5,-312 732.5,-318 726.5,-324 720.5,-324"/> -<text text-anchor="middle" x="705" y="-303.5" font-family="sans" font-size="10.00" fill="#000000">jellyfish</text> -</g> -<!-- 3->2 --> -<g id="edge16" class="edge"> -<title>3->2</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M705,-287.8314C705,-280.131 705,-270.9743 705,-262.4166"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="708.5001,-262.4132 705,-252.4133 701.5001,-262.4133 708.5001,-262.4132"/> -</g> -<!-- 9 --> -<g id="node10" class="node"> -<title>9</title> -<path fill="none" stroke="#56d873" stroke-width="2" d="M632,-252C632,-252 602,-252 602,-252 596,-252 590,-246 590,-240 590,-240 590,-228 590,-228 590,-222 596,-216 602,-216 602,-216 632,-216 632,-216 638,-216 644,-222 644,-228 644,-228 644,-240 644,-240 644,-246 638,-252 632,-252"/> -<text text-anchor="middle" x="617" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">kat</text> -</g> -<!-- 3->9 --> -<g id="edge21" class="edge"> -<title>3->9</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M682.7939,-287.8314C671.955,-278.9632 658.7556,-268.1637 647.0322,-258.5718"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="649.155,-255.7864 639.199,-252.1628 644.7223,-261.2041 649.155,-255.7864"/> -</g> -<!-- 15 --> -<g id="node16" class="node"> -<title>15</title> -<path fill="none" stroke="#d86656" stroke-width="2" d="M927.5,-252C927.5,-252 886.5,-252 886.5,-252 880.5,-252 874.5,-246 874.5,-240 874.5,-240 874.5,-228 874.5,-228 874.5,-222 880.5,-216 886.5,-216 886.5,-216 927.5,-216 927.5,-216 933.5,-216 939.5,-222 939.5,-228 939.5,-228 939.5,-240 939.5,-240 939.5,-246 933.5,-252 927.5,-252"/> -<text text-anchor="middle" x="907" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">purge_kat</text> -</g> -<!-- 3->15 --> -<g id="edge29" class="edge"> -<title>3->15</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M732.5906,-292.313C736.0602,-290.7755 739.5886,-289.2979 743,-288 793.4297,-268.8133 810.6621,-271.161 865.0412,-252.1161"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="866.2502,-255.4008 874.4805,-248.7291 863.886,-248.8121 866.2502,-255.4008"/> -</g> -<!-- 4 --> -<g id="node5" class="node"> -<title>4</title> -<path fill="none" stroke="#5663d8" stroke-width="2" d="M304,-252C304,-252 178,-252 178,-252 
172,-252 166,-246 166,-240 166,-240 166,-228 166,-228 166,-222 172,-216 178,-216 178,-216 304,-216 304,-216 310,-216 316,-222 316,-228 316,-228 316,-240 316,-240 316,-246 310,-252 304,-252"/> -<text text-anchor="middle" x="241" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">genometools_on_assembly</text> -</g> -<!-- 4->0 --> -<g id="edge11" class="edge"> -<title>4->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M256.2603,-215.7724C286.216,-181.3512 356.4925,-107.0975 433,-72 485.8272,-47.7658 655.9142,-28.8862 731.8209,-21.4448"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="732.2681,-24.918 741.8846,-20.4713 731.594,-17.9505 732.2681,-24.918"/> -</g> -<!-- 4->20 --> -<g id="edge42" class="edge"> -<title>4->20</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M316.0569,-217.6576C319.0743,-217.0826 322.0642,-216.5276 325,-216 446.618,-194.1436 591.8842,-175.007 658.8425,-166.576"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="659.4136,-170.0319 668.901,-165.316 658.5434,-163.0862 659.4136,-170.0319"/> -</g> -<!-- 5 --> -<g id="node6" class="node"> -<title>5</title> -<path fill="none" stroke="#9fd856" stroke-width="2" d="M628.5,-540C628.5,-540 551.5,-540 551.5,-540 545.5,-540 539.5,-534 539.5,-528 539.5,-528 539.5,-516 539.5,-516 539.5,-510 545.5,-504 551.5,-504 551.5,-504 628.5,-504 628.5,-504 634.5,-504 640.5,-510 640.5,-516 640.5,-516 640.5,-528 640.5,-528 640.5,-534 634.5,-540 628.5,-540"/> -<text text-anchor="middle" x="590" y="-519.5" font-family="sans" font-size="10.00" fill="#000000">hap_gfa_to_fasta</text> -</g> -<!-- 5->4 --> -<g id="edge17" class="edge"> -<title>5->4</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M555.689,-503.8043C538.1887,-493.9913 516.8811,-481.2054 499,-468 406.8037,-399.9121 309.7037,-304.3778 265.4985,-259.3605"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="267.8629,-256.7723 258.3688,-252.0699 262.8583,-261.6666 267.8629,-256.7723"/> -</g> -<!-- 8 --> -<g id="node9" class="node"> -<title>8</title> -<path fill="none" stroke="#5692d8" stroke-width="2" d="M487,-324C487,-324 415,-324 415,-324 409,-324 403,-318 403,-312 403,-312 403,-300 403,-300 403,-294 409,-288 415,-288 415,-288 487,-288 487,-288 493,-288 499,-294 499,-300 499,-300 499,-312 499,-312 499,-318 493,-324 487,-324"/> -<text text-anchor="middle" x="451" y="-303.5" font-family="sans" font-size="10.00" fill="#000000">unzip_hap_fasta</text> -</g> -<!-- 5->8 --> -<g id="edge20" class="edge"> -<title>5->8</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M563.6467,-503.9739C551.0609,-494.3891 536.494,-481.7538 526,-468 493.9697,-426.0199 471.354,-368.1635 459.8086,-334.1571"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="463.0483,-332.8064 456.5828,-324.4131 456.403,-335.0064 463.0483,-332.8064"/> -</g> -<!-- 5->9 --> -<g id="edge22" class="edge"> -<title>5->9</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M593.0424,-503.5454C594.6818,-493.1306 596.641,-479.8638 598,-468 606.4665,-394.086 612.5189,-306.6126 615.2955,-262.4929"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="618.8053,-262.442 615.9303,-252.2447 611.8187,-262.0091 618.8053,-262.442"/> -</g> -<!-- 12 --> -<g id="node13" class="node"> -<title>12</title> -<path fill="none" stroke="#56d8b9" stroke-width="2" d="M1021.5,-396C1021.5,-396 972.5,-396 972.5,-396 966.5,-396 960.5,-390 960.5,-384 960.5,-384 960.5,-372 960.5,-372 960.5,-366 
966.5,-360 972.5,-360 972.5,-360 1021.5,-360 1021.5,-360 1027.5,-360 1033.5,-366 1033.5,-372 1033.5,-372 1033.5,-384 1033.5,-384 1033.5,-390 1027.5,-396 1021.5,-396"/> -<text text-anchor="middle" x="997" y="-375.5" font-family="sans" font-size="10.00" fill="#000000">purge_dups</text> -</g> -<!-- 5->12 --> -<g id="edge26" class="edge"> -<title>5->12</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M640.7239,-504.0535C720.3611,-475.8772 873.5598,-421.6742 950.5018,-394.4515"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="952.1308,-397.5878 960.3907,-390.9527 949.7959,-390.9886 952.1308,-397.5878"/> -</g> -<!-- 13 --> -<g id="node14" class="node"> -<title>13</title> -<path fill="none" stroke="#5682d8" stroke-width="2" d="M1040.5,-468C1040.5,-468 953.5,-468 953.5,-468 947.5,-468 941.5,-462 941.5,-456 941.5,-456 941.5,-444 941.5,-444 941.5,-438 947.5,-432 953.5,-432 953.5,-432 1040.5,-432 1040.5,-432 1046.5,-432 1052.5,-438 1052.5,-444 1052.5,-444 1052.5,-456 1052.5,-456 1052.5,-462 1046.5,-468 1040.5,-468"/> -<text text-anchor="middle" x="997" y="-447.5" font-family="sans" font-size="10.00" fill="#000000">purge_dups_cutoffs</text> -</g> -<!-- 5->13 --> -<g id="edge27" class="edge"> -<title>5->13</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M640.7239,-513.0267C714.1887,-500.0305 850.2539,-475.96 931.3067,-461.6214"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="932.1581,-465.0252 941.3955,-459.8367 930.9387,-458.1323 932.1581,-465.0252"/> -</g> -<!-- 23 --> -<g id="node24" class="node"> -<title>23</title> -<path fill="none" stroke="#afd856" stroke-width="2" d="M577,-468C577,-468 547,-468 547,-468 541,-468 535,-462 535,-456 535,-456 535,-444 535,-444 535,-438 541,-432 547,-432 547,-432 577,-432 577,-432 583,-432 589,-438 589,-444 589,-444 589,-456 589,-456 589,-462 583,-468 577,-468"/> -<text text-anchor="middle" x="562" y="-447.5" font-family="sans" font-size="10.00" fill="#000000">cp_hap</text> -</g> -<!-- 5->23 --> -<g id="edge49" class="edge"> -<title>5->23</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M582.9344,-503.8314C579.874,-495.9617 576.2221,-486.5712 572.8318,-477.8533"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="576.0473,-476.4647 569.1607,-468.4133 569.5232,-479.0019 576.0473,-476.4647"/> -</g> -<!-- 6 --> -<g id="node7" class="node"> -<title>6</title> -<path fill="none" stroke="#d8bc56" stroke-width="2" d="M605,-612C605,-612 575,-612 575,-612 569,-612 563,-606 563,-600 563,-600 563,-588 563,-588 563,-582 569,-576 575,-576 575,-576 605,-576 605,-576 611,-576 617,-582 617,-588 617,-588 617,-600 617,-600 617,-606 611,-612 605,-612"/> -<text text-anchor="middle" x="590" y="-591.5" font-family="sans" font-size="10.00" fill="#000000">hifiasm</text> -</g> -<!-- 6->5 --> -<g id="edge18" class="edge"> -<title>6->5</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M590,-575.8314C590,-568.131 590,-558.9743 590,-550.4166"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="593.5001,-550.4132 590,-540.4133 586.5001,-550.4133 593.5001,-550.4132"/> -</g> -<!-- 7 --> -<g id="node8" class="node"> -<title>7</title> -<path fill="none" stroke="#bed856" stroke-width="2" d="M376,-252C376,-252 346,-252 346,-252 340,-252 334,-246 334,-240 334,-240 334,-228 334,-228 334,-222 340,-216 346,-216 346,-216 376,-216 376,-216 382,-216 388,-222 388,-228 388,-228 388,-240 388,-240 388,-246 382,-252 376,-252"/> -<text text-anchor="middle" x="361" y="-231.5" 
font-family="sans" font-size="10.00" fill="#000000">busco</text> -</g> -<!-- 7->0 --> -<g id="edge4" class="edge"> -<title>7->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M368.6017,-215.9044C383.975,-181.7088 422.1989,-107.824 479,-72 520.5285,-45.8083 662.5351,-28.5579 731.208,-21.5482"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="731.9713,-24.9894 741.5733,-20.5113 731.2745,-18.0241 731.9713,-24.9894"/> -</g> -<!-- 7->20 --> -<g id="edge37" class="edge"> -<title>7->20</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M388.131,-219.7716C391.4033,-218.3645 394.741,-217.0656 398,-216 489.0354,-186.2333 601.4018,-171.4869 658.6479,-165.4566"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="659.0862,-168.9301 668.678,-164.4304 658.3736,-161.9665 659.0862,-168.9301"/> -</g> -<!-- 8->7 --> -<g id="edge19" class="edge"> -<title>8->7</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M428.2892,-287.8314C417.204,-278.9632 403.7046,-268.1637 391.7148,-258.5718"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="393.6987,-255.6768 383.7035,-252.1628 389.3258,-261.1429 393.6987,-255.6768"/> -</g> -<!-- 10 --> -<g id="node11" class="node"> -<title>10</title> -<path fill="none" stroke="#c6d856" stroke-width="2" d="M483.5,-252C483.5,-252 418.5,-252 418.5,-252 412.5,-252 406.5,-246 406.5,-240 406.5,-240 406.5,-228 406.5,-228 406.5,-222 412.5,-216 418.5,-216 418.5,-216 483.5,-216 483.5,-216 489.5,-216 495.5,-222 495.5,-228 495.5,-228 495.5,-240 495.5,-240 495.5,-246 489.5,-252 483.5,-252"/> -<text text-anchor="middle" x="451" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">find_telomeres</text> -</g> -<!-- 8->10 --> -<g id="edge23" class="edge"> -<title>8->10</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M451,-287.8314C451,-280.131 451,-270.9743 451,-262.4166"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="454.5001,-262.4132 451,-252.4133 447.5001,-262.4133 454.5001,-262.4132"/> -</g> -<!-- 9->0 --> -<g id="edge14" class="edge"> -<title>9->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M614.6552,-215.7578C611.5127,-183.239 609.7096,-114.6717 643,-72 664.5778,-44.3415 703.0788,-30.6313 731.8617,-23.9764"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="732.9207,-27.3297 741.9769,-21.8311 731.4683,-20.4821 732.9207,-27.3297"/> -</g> -<!-- 9->20 --> -<g id="edge45" class="edge"> -<title>9->20</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M636.935,-215.8314C646.4783,-207.1337 658.0599,-196.5783 668.4306,-187.1265"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="671.038,-189.4857 676.0713,-180.1628 666.3228,-184.3121 671.038,-189.4857"/> -</g> -<!-- 10->0 --> -<g id="edge1" class="edge"> -<title>10->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M452.0719,-215.7403C455.0875,-181.9324 466.55,-109.6148 510,-72 542.8135,-43.5933 668.3208,-27.6995 731.7871,-21.3245"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="732.1709,-24.8038 741.7831,-20.3478 731.4901,-17.8369 732.1709,-24.8038"/> -</g> -<!-- 10->20 --> -<g id="edge35" class="edge"> -<title>10->20</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M495.6519,-220.8778C542.5168,-207.1053 615.3375,-185.7049 659.0751,-172.8514"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="660.1331,-176.1886 668.7405,-170.011 658.1594,-169.4726 660.1331,-176.1886"/> 
-</g> -<!-- 11 --> -<g id="node12" class="node"> -<title>11</title> -<path fill="none" stroke="#80d856" stroke-width="2" d="M1024,-252C1024,-252 970,-252 970,-252 964,-252 958,-246 958,-240 958,-240 958,-228 958,-228 958,-222 964,-216 970,-216 970,-216 1024,-216 1024,-216 1030,-216 1036,-222 1036,-228 1036,-228 1036,-240 1036,-240 1036,-246 1030,-252 1024,-252"/> -<text text-anchor="middle" x="997" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">purge_busco</text> -</g> -<!-- 11->0 --> -<g id="edge7" class="edge"> -<title>11->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M978.5547,-215.8096C947.8417,-185.6125 884.1418,-123.3716 829,-72 818.6228,-62.3323 807.0734,-51.8691 796.9004,-42.7503"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="799.2199,-40.1292 789.4315,-36.0746 794.555,-45.3483 799.2199,-40.1292"/> -</g> -<!-- 11->20 --> -<g id="edge39" class="edge"> -<title>11->20</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M957.8675,-218.7845C954.8839,-217.7907 951.9043,-216.8494 949,-216 873.5413,-193.9312 782.9846,-176.7947 733.259,-168.1638"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="733.736,-164.6945 723.288,-166.4524 732.5518,-171.5936 733.736,-164.6945"/> -</g> -<!-- 12->11 --> -<g id="edge24" class="edge"> -<title>12->11</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M997,-359.7623C997,-335.201 997,-291.2474 997,-262.3541"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="1000.5001,-262.0896 997,-252.0896 993.5001,-262.0897 1000.5001,-262.0896"/> -</g> -<!-- 14 --> -<g id="node15" class="node"> -<title>14</title> -<path fill="none" stroke="#d8ac56" stroke-width="2" d="M1155.5,-252C1155.5,-252 1066.5,-252 1066.5,-252 1060.5,-252 1054.5,-246 1054.5,-240 1054.5,-240 1054.5,-228 1054.5,-228 1054.5,-222 1060.5,-216 1066.5,-216 1066.5,-216 1155.5,-216 1155.5,-216 1161.5,-216 1167.5,-222 1167.5,-228 1167.5,-228 1167.5,-240 1167.5,-240 1167.5,-246 1161.5,-252 1155.5,-252"/> -<text text-anchor="middle" x="1111" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">purge_genometools</text> -</g> -<!-- 12->14 --> -<g id="edge28" class="edge"> -<title>12->14</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M1011.4382,-359.7623C1031.3965,-334.5518 1067.5293,-288.9103 1090.3413,-260.0952"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="1093.2162,-262.1026 1096.6791,-252.0896 1087.7278,-257.7576 1093.2162,-262.1026"/> -</g> -<!-- 12->15 --> -<g id="edge30" class="edge"> -<title>12->15</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M985.6014,-359.7623C969.9802,-334.7682 941.8077,-289.6924 923.7755,-260.8409"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="926.574,-258.7146 918.306,-252.0896 920.638,-262.4246 926.574,-258.7146"/> -</g> -<!-- 16 --> -<g id="node17" class="node"> -<title>16</title> -<path fill="none" stroke="#d88d56" stroke-width="2" d="M1296,-252C1296,-252 1198,-252 1198,-252 1192,-252 1186,-246 1186,-240 1186,-240 1186,-228 1186,-228 1186,-222 1192,-216 1198,-216 1198,-216 1296,-216 1296,-216 1302,-216 1308,-222 1308,-228 1308,-228 1308,-240 1308,-240 1308,-246 1302,-252 1296,-252"/> -<text text-anchor="middle" x="1247" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">purge_find_telomeres</text> -</g> -<!-- 12->16 --> -<g id="edge31" class="edge"> -<title>12->16</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" 
d="M1028.4095,-359.9081C1073.5952,-333.8812 1156.9584,-285.8639 1206.652,-257.2404"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="1208.6039,-260.1553 1215.5223,-252.1311 1205.11,-254.0896 1208.6039,-260.1553"/> -</g> -<!-- 18 --> -<g id="node19" class="node"> -<title>18</title> -<path fill="none" stroke="#8fd856" stroke-width="2" d="M1279.5,-180C1279.5,-180 1216.5,-180 1216.5,-180 1210.5,-180 1204.5,-174 1204.5,-168 1204.5,-168 1204.5,-156 1204.5,-156 1204.5,-150 1210.5,-144 1216.5,-144 1216.5,-144 1279.5,-144 1279.5,-144 1285.5,-144 1291.5,-150 1291.5,-156 1291.5,-156 1291.5,-168 1291.5,-168 1291.5,-174 1285.5,-180 1279.5,-180"/> -<text text-anchor="middle" x="1248" y="-159.5" font-family="sans" font-size="10.00" fill="#000000">link_final_asm</text> -</g> -<!-- 12->18 --> -<g id="edge33" class="edge"> -<title>12->18</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M1033.7241,-369.0603C1110.5371,-349.5387 1283.8146,-300.9154 1317,-252 1325.9827,-238.7595 1324.008,-230.3836 1317,-216 1311.2696,-204.2386 1301.6013,-194.2849 1291.2879,-186.2518"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="1293.0974,-183.2412 1282.9415,-180.2233 1288.9987,-188.9159 1293.0974,-183.2412"/> -</g> -<!-- 26 --> -<g id="node27" class="node"> -<title>26</title> -<path fill="none" stroke="#56d882" stroke-width="2" d="M916.5,-324C916.5,-324 879.5,-324 879.5,-324 873.5,-324 867.5,-318 867.5,-312 867.5,-312 867.5,-300 867.5,-300 867.5,-294 873.5,-288 879.5,-288 879.5,-288 916.5,-288 916.5,-288 922.5,-288 928.5,-294 928.5,-300 928.5,-300 928.5,-312 928.5,-312 928.5,-318 922.5,-324 916.5,-324"/> -<text text-anchor="middle" x="898" y="-303.5" font-family="sans" font-size="10.00" fill="#000000">purge_cp</text> -</g> -<!-- 12->26 --> -<g id="edge53" class="edge"> -<title>12->26</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M972.0181,-359.8314C959.7072,-350.8779 944.6893,-339.9558 931.4063,-330.2955"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="933.1199,-327.214 922.9739,-324.1628 929.0027,-332.8752 933.1199,-327.214"/> -</g> -<!-- 13->12 --> -<g id="edge25" class="edge"> -<title>13->12</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M997,-431.8314C997,-424.131 997,-414.9743 997,-406.4166"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="1000.5001,-406.4132 997,-396.4133 993.5001,-406.4133 1000.5001,-406.4132"/> -</g> -<!-- 17 --> -<g id="node18" class="node"> -<title>17</title> -<path fill="none" stroke="#56c9d8" stroke-width="2" d="M1299,-108C1299,-108 1247,-108 1247,-108 1241,-108 1235,-102 1235,-96 1235,-96 1235,-84 1235,-84 1235,-78 1241,-72 1247,-72 1247,-72 1299,-72 1299,-72 1305,-72 1311,-78 1311,-84 1311,-84 1311,-96 1311,-96 1311,-102 1305,-108 1299,-108"/> -<text text-anchor="middle" x="1273" y="-87.5" font-family="sans" font-size="10.00" fill="#000000">cutoffs_eval</text> -</g> -<!-- 13->17 --> -<g id="edge32" class="edge"> -<title>13->17</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M1052.6276,-440.7111C1157.4965,-421.6152 1374,-373.9615 1374,-306 1374,-306 1374,-306 1374,-234 1374,-184.9451 1334.2284,-140.7517 1304.5999,-114.6353"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="1306.809,-111.9191 1296.9356,-108.0763 1302.2576,-117.2375 1306.809,-111.9191"/> -</g> -<!-- 14->0 --> -<g id="edge12" class="edge"> -<title>14->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M1083.8769,-215.8627C1038.2536,-185.4814 
943.0854,-122.6518 861,-72 842.7811,-60.7578 822.2576,-48.6583 805.2255,-38.7678"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="806.5672,-35.5004 796.1591,-33.5198 803.0604,-41.5587 806.5672,-35.5004"/> -</g> -<!-- 14->20 --> -<g id="edge43" class="edge"> -<title>14->20</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M1054.2177,-218.0772C1051.1077,-217.3417 1048.0202,-216.6434 1045,-216 932.5289,-192.0416 797.5267,-174.2035 733.3801,-166.378"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="733.6896,-162.89 723.3419,-165.1648 732.8496,-169.8394 733.6896,-162.89"/> -</g> -<!-- 15->0 --> -<g id="edge2" class="edge"> -<title>15->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M895.4716,-215.9555C871.1694,-177.9173 814.5803,-89.3431 786.0664,-44.7127"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="788.9261,-42.6877 780.5927,-36.1451 783.0272,-46.4564 788.9261,-42.6877"/> -</g> -<!-- 15->20 --> -<g id="edge36" class="edge"> -<title>15->20</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M874.455,-219.3397C871.6109,-218.1719 868.764,-217.0424 866,-216 820.7722,-198.9437 767.4965,-182.6111 732.8015,-172.4609"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="733.6753,-169.07 723.0959,-169.6409 731.7222,-175.7921 733.6753,-169.07"/> -</g> -<!-- 16->0 --> -<g id="edge8" class="edge"> -<title>16->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M1210.28,-215.968C1147.5514,-185.3424 1015.519,-121.6554 902,-72 869.6675,-57.8572 832.4089,-42.8375 805.5673,-32.2387"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="806.7651,-28.9488 796.178,-28.5436 804.2017,-35.4626 806.7651,-28.9488"/> -</g> -<!-- 16->20 --> -<g id="edge40" class="edge"> -<title>16->20</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M1185.6478,-217.7061C1182.7321,-217.0948 1179.8385,-216.5216 1177,-216 1013.073,-185.8748 814.4224,-170.015 733.2586,-164.3992"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="733.2652,-160.8917 723.0506,-163.7044 732.7898,-167.8755 733.2652,-160.8917"/> -</g> -<!-- 17->0 --> -<g id="edge3" class="edge"> -<title>17->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M1234.7596,-80.8678C1221.0144,-77.7812 1205.3608,-74.4899 1191,-72 1049.844,-47.5257 880.336,-29.1392 806.4687,-21.6649"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="806.5787,-18.1585 796.279,-20.6419 805.8793,-25.1234 806.5787,-18.1585"/> -</g> -<!-- 18->0 --> -<g id="edge5" class="edge"> -<title>18->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M1221.3691,-143.9882C1189.5927,-123.3636 1134.1893,-90.0996 1082,-72 985.9157,-38.6774 865.8257,-25.3411 806.1861,-20.4886"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="806.3573,-16.9914 796.1156,-19.7038 805.8134,-23.9703 806.3573,-16.9914"/> -</g> -<!-- 19 --> -<g id="node20" class="node"> -<title>19</title> -<path fill="none" stroke="#56d8a2" stroke-width="2" d="M728.5,-108C728.5,-108 663.5,-108 663.5,-108 657.5,-108 651.5,-102 651.5,-96 651.5,-96 651.5,-84 651.5,-84 651.5,-78 657.5,-72 663.5,-72 663.5,-72 728.5,-72 728.5,-72 734.5,-72 740.5,-78 740.5,-84 740.5,-84 740.5,-96 740.5,-96 740.5,-102 734.5,-108 728.5,-108"/> -<text text-anchor="middle" x="696" y="-87.5" font-family="sans" font-size="10.00" fill="#000000">rename_report</text> -</g> -<!-- 19->0 --> -<g id="edge10" class="edge"> -<title>19->0</title> -<path 
fill="none" stroke="#c0c0c0" stroke-width="2" d="M714.421,-71.8314C723.1529,-63.219 733.7317,-52.7851 743.2423,-43.4048"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="745.923,-45.6769 750.5849,-36.1628 741.0075,-40.6931 745.923,-45.6769"/> -</g> -<!-- 20->19 --> -<g id="edge34" class="edge"> -<title>20->19</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M696,-143.8314C696,-136.131 696,-126.9743 696,-118.4166"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="699.5001,-118.4132 696,-108.4133 692.5001,-118.4133 699.5001,-118.4132"/> -</g> -<!-- 21 --> -<g id="node22" class="node"> -<title>21</title> -<path fill="none" stroke="#d88556" stroke-width="2" d="M578,-324C578,-324 544,-324 544,-324 538,-324 532,-318 532,-312 532,-312 532,-300 532,-300 532,-294 538,-288 544,-288 544,-288 578,-288 578,-288 584,-288 590,-294 590,-300 590,-300 590,-312 590,-312 590,-318 584,-324 578,-324"/> -<text text-anchor="middle" x="561" y="-303.5" font-family="sans" font-size="10.00" fill="#000000">merqury</text> -</g> -<!-- 21->0 --> -<g id="edge6" class="edge"> -<title>21->0</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M560.5729,-287.5068C560.3533,-277.08 560.1094,-263.814 560,-252 559.2193,-167.6762 558.984,-130.2079 620,-72 650.7188,-42.6949 698.5374,-29.0986 731.6826,-22.9208"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="732.6263,-26.31 741.8881,-21.1652 731.4395,-19.4113 732.6263,-26.31"/> -</g> -<!-- 21->20 --> -<g id="edge38" class="edge"> -<title>21->20</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M560.5269,-287.7828C560.9552,-268.0437 564.4916,-236.6148 581,-216 600.4744,-191.6814 633.4687,-177.7591 659.1348,-170.1642"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="660.2107,-173.4982 668.9196,-167.4646 658.3489,-166.7503 660.2107,-173.4982"/> -</g> -<!-- 22 --> -<g id="node23" class="node"> -<title>22</title> -<path fill="none" stroke="#56a2d8" stroke-width="2" d="M751,-396C751,-396 721,-396 721,-396 715,-396 709,-390 709,-384 709,-384 709,-372 709,-372 709,-366 715,-360 721,-360 721,-360 751,-360 751,-360 757,-360 763,-366 763,-372 763,-372 763,-384 763,-384 763,-390 757,-396 751,-396"/> -<text text-anchor="middle" x="736" y="-375.5" font-family="sans" font-size="10.00" fill="#000000">meryl</text> -</g> -<!-- 22->21 --> -<g id="edge48" class="edge"> -<title>22->21</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M708.6563,-366.75C679.3459,-354.6909 632.5808,-335.4504 599.5445,-321.8583"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="600.7721,-318.5788 590.1925,-318.0106 598.1087,-325.0523 600.7721,-318.5788"/> -</g> -<!-- 25 --> -<g id="node26" class="node"> -<title>25</title> -<path fill="none" stroke="#d89c56" stroke-width="2" d="M834.5,-324C834.5,-324 763.5,-324 763.5,-324 757.5,-324 751.5,-318 751.5,-312 751.5,-312 751.5,-300 751.5,-300 751.5,-294 757.5,-288 763.5,-288 763.5,-288 834.5,-288 834.5,-288 840.5,-288 846.5,-294 846.5,-300 846.5,-300 846.5,-312 846.5,-312 846.5,-318 840.5,-324 834.5,-324"/> -<text text-anchor="middle" x="799" y="-303.5" font-family="sans" font-size="10.00" fill="#000000">purge_cp_meryl</text> -</g> -<!-- 22->25 --> -<g id="edge52" class="edge"> -<title>22->25</title> -<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M751.8976,-359.8314C759.2278,-351.454 768.066,-341.3531 776.0969,-332.1749"/> -<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="778.9373,-334.2438 
diff --git a/doc/going_further.md b/doc/going_further.md
index 6a7705d..fd4edda 100644
--- a/doc/going_further.md
+++ b/doc/going_further.md
@@ -1,52 +1,60 @@
-# Going further
-
-## 01. Job.sh/local_run.sh options
-Usage: job.sh/local_run.sh [dry|run|dag|rulegraph|unlock]
-- [dry] - run the specified Snakefile in dry-run mode
-- [run] - run the specified Snakefile normally
-- [dag] - generate the directed acyclic graph for the specified Snakefile
-- [rulegraph] - generate the rulegraph for the specified Snakefile
-- [unlock] - Unlock the directory if snakemake crashed
-
-## 02. Workflow options
-Inside the ./.config/marsterconfig.yaml file you can add more options
-
-Here are all the options and their default values :
-- `fasta_gz` : Your reads (mandatory)
-- `mode`: [default, hi-c, trio] The mode for hifiasm assembly (default: default)
-  - `r1` if hi-c of trio mode the run1/parent1 read file
-  - `r2` if hi-c of trio mode the run2/parent2 read file
-- `run_purge_dups` : [True, False] If set to true, the workflow will run [purge_dups](https://github.com/dfguan/purge_dups) on the assembly. 
(default: False)
-- `busco_lineage` : The busco lineage of your organisms listed [here](https://busco.ezlab.org/list_of_lineages.html) (default: eukaryota_odb10)
-- `ploidy` : The ploidy of the organims (default: 2)
-- `run_ragtag` : [True, False] If set to true, the workflow will run RagTag and produce a scafold of the assemblies (default: False)
-  - `reference_genome` : The reference genome used for Quast and RagTag scafolding
-
-
-/!\ Advanced options, use only if you have read the docs of the tools, we strogly advise keeping default values :
-- `assembly_purge_force` : [1-3] the purge level of Hifiasm `-l` parametter, full description [here](https://hifiasm.readthedocs.io/en/latest/parameter-reference.html) (default: 3)
-- `kmer_size` : The sizes of the kmers used for QC steps (default: 21)
-
-## 03. Example configurations
-### Minimal config
+# Going Further
+
+## 01. Job.sh/local_run.sh Options
+
+Usage: `job.sh/local_run.sh [dry|run|dag|rulegraph|unlock]`
+
+- `dry` - Run the specified Snakefile in dry-run mode
+- `run` - Run the specified Snakefile normally
+- `dag` - Generate the directed acyclic graph for the specified Snakefile
+- `rulegraph` - Generate the rulegraph for the specified Snakefile
+- `unlock` - Unlock the directory if Snakemake crashed
+
+## 02. Workflow Options
+
+Inside the `./.config/masterconfig.yaml` file, you can add more options.
+Here are all the options and their default values:
+
+- `fasta_gz`: Your reads (mandatory)
+- `mode`: [default, hi-c, trio] The mode for Hifiasm assembly (default: default)
+- `r1`: If hi-c or trio mode, the run1/parent1 read file
+- `r2`: If hi-c or trio mode, the run2/parent2 read file
+- `run_purge_dups`: [True, False] If set to true, the workflow will run [purge_dups](https://github.com/dfguan/purge_dups) on the assembly (default: False)
+- `busco_lineage`: The BUSCO lineage of your organism listed [here](https://busco.ezlab.org/list_of_lineages.html) (default: eukaryota_odb10)
+- `ploidy`: The ploidy of the organism (default: 2)
+- `run_ragtag`: [True, False] If set to true, the workflow will run RagTag and produce a scaffold of the assemblies (default: False)
+- `reference_genome`: The reference genome used for QUAST and RagTag scaffolding
+
+⚠️ Advanced options (use only if you have read the tools' documentation; we strongly advise keeping default values):
+
+- `assembly_purge_force`: [1-3] The purge level of Hifiasm `-l` parameter, full description [here](https://hifiasm.readthedocs.io/en/latest/parameter-reference.html) (default: 3)
+- `kmer_size`: The sizes of the kmers used for QC steps (default: 21)
+
+## 03. Example Configurations
+
+### Minimal Config
+
+This minimal configuration will conduct a de novo assembly with default values:
 ```yaml
 samples:
   example1:
     fasta_gz: example.fasta.gz
 ```
 
-### Simple config
+### Simple Config
+
+This simple configuration will conduct a de novo assembly with tailored values. **We recommend using this type of configuration:**
 ```yaml
 samples:
   example1:
     fasta_gz: example.fasta.gz
     busco_lineage: eudicots_odb10
-    run_purge_dups: True
     run_ragtag: True
     reference_genome: ref.fasta.gz
-
 ```
-### Hi-c config
+
+### Hi-C Config
+This example shows how to use the workflow with Hi-C assembly mode which takes PacBio HiFi data and Hi-C data as input.
 ```yaml
 samples:
   example1:
     fasta_gz: example.fasta.gz
     mode: hi-c
     r1: run1.fasta.gz
     r2: run2.fasta.gz
-
 ```
-### Trio config
+### Trio Config
+This example shows how to use the workflow with trio assembly mode. The parental reads files can be Illumina or PacBio HiFi reads.
 ```yaml
 samples:
   example1:
     fasta_gz: example.fasta.gz
     mode: trio
     r1: parent1.fasta.gz
     r2: parent2.fasta.gz
 ```
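(A hedged aside on what trio mode does with `r1`/`r2`: the parental reads are only turned into YAK k-mer databases used to sort the child's reads by haplotype; they are not assembled themselves. The manual equivalent, following hifiasm's documented trio usage, with illustrative file names:)

```bash
yak count -k31 -o parent1.yak parent1.fasta.gz   # paternal k-mer database
yak count -k31 -o parent2.yak parent2.fasta.gz   # maternal k-mer database
hifiasm -o example1 -1 parent1.yak -2 parent2.yak example.fasta.gz
```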
 
-### Adanced config
+### Advanced Config
 ```yaml
 samples:
   example1:
     fasta_gz: example.fasta.gz
     mode: hi-c
     r1: run1.fasta.gz
     r2: run2.fasta.gz
     run_purge_dups: True
     assembly_purge_force: 2
     ploidy: 2
     kmer_size: 21
     busco_lineage: eudicots_odb10
     run_ragtag: True
     reference_genome: ref.fasta.gz
 ```
 
-## 04. Run the workflow on multiple datasets
-You can run the workflow on multiple datasets at the same time.
+## 04. Run the Workflow on Multiple Datasets
 
+You can run the workflow on multiple datasets at the same time.
 ```yaml
 samples:
   dataset_1:
     fasta_gz: example_1.fasta.gz
     run_purge_dups: True
   dataset_2:
     fasta_gz: example_2.fasta.gz
     run_purge_dups: False
   dataset_n:
     fasta_gz: example_n.fasta.gz
-```
-
+```
\ No newline at end of file
diff --git a/doc/known_errors.md b/doc/known_errors.md
index 856fe2b..b648666 100644
--- a/doc/known_errors.md
+++ b/doc/known_errors.md
@@ -1,18 +1,23 @@
 # Troubleshooting
 
-## One of the BUSCO rules failed
-The first time you run the workflow, the BUSCO lineage might be downloaded multiple times. This can create a conflict between the jobs using BUSCO and may interrupt some of them. In that case, you only need to rerun the workflow once everything is done.
+## BUSCO Rule Failures
+
+During first run, multiple simultaneous BUSCO lineage downloads may cause job conflicts. Simply rerun the workflow after completion to resolve this.
 
-## Snakemake locked directory
-When you try to rerun the workflow after cancelling a job, you may have to unlock the results directory. To do so, go in `job.sh/local_run.sh` and uncomment `#--unlock`. Run the workflow once to unlock the directory (it should only take a few seconds). Still in `job.sh/local_run.sh`, re add the `#`. The workflow will be able to run and create outputs.
+## Snakemake Locked Directory
+
+If workflow rerun fails after job cancellation:
+1. Run `job.sh/local_run.sh unlock`
+2. Then rerun the workflow normally
 
-## HPC problems
-The asm4pg.sh does not work with HPC that does not allow a job to run other jobs. 
+## HPC Problems
+
+The `job.sh` script is incompatible with HPCs that restrict job nesting.
 
-If the version of SLRUM in the HPC is old, you may run into this error `srun: unrecognized option '--cpu-bind=q'` this is a known SLURM/Snakemake issue and SLRUM needs to be updated (https://github.com/snakemake/snakemake/issues/2071)
+For older SLURM versions, you may encounter: `srun: unrecognized option '--cpu-bind=q'`. This is a [known SLURM/Snakemake issue](https://github.com/snakemake/snakemake/issues/2071) requiring SLURM update.
 
-A temporary sollution is to run the ./local_run.sh with sbatch :
-```
+A temporary workaround for those issues is using `local_run.sh` with sbatch:
+```bash
 module load Singularity
 source activate wf_env
 sbatch ./local_run.sh dry
diff --git a/doc/outputs.md b/doc/outputs.md
index 669264a..c345d3b 100644
--- a/doc/outputs.md
+++ b/doc/outputs.md
@@ -1,49 +1,57 @@
-# Workflow output
+# Workflow Output
 
 ## Directories
-There are 4 directories for the data produced by the workflow:
-- `01_raw_assembly` which contains the direct output of Hifiasm
-- `02_final_assembly` which contains the assembled haplotypes that may have been purged of haplotigs and/or scafolded
-- `03_raw_qc` which contains quality metrics for the reads.
-- `04_assembly_qc` which contains quality metrics of the final assembly. 
+- `01_raw_assembly`: Direct Hifiasm output +- `02_final_assembly`: Processed haplotypes (purged/scaffolded) +- `03_raw_qc`: Read quality metrics +- `04_assembly_qc`: Final assembly quality metrics -## Files +## Files ```bash -results/ # Results folder containg all run -└── {sample}_results - ├── 01_raw_assembly # Raw assembly folder with gfa and fasta files - │  ├── {sample}_hap1.gfa - │ ├── {sample}_hap2.gfa - │  ├── {sample}_hap1.fasta.gz - │  ├── {sample}_hap2.fasta.gz - │  └──{sample}_hifiasm_benchmark.txt - ├── 02_final_assembly # Final assembly driectory with fasta file - │  ├── hap1 - │  │  ├── cutoffs - │  │  ├── ragtag_scafold # Driectory that contains scafolded haplotypes - │  │  │  └── recap.txt - │  │  └── {sample}_final_hap1.fasta.gz - │  └── hap2 - │  └──... - ├── 03_raw_data_qc # Driectory that contains QC on the reads - │  ├── genomescope - │  │  └── ... - │  ├── jellyfish - │  │  └── ... - │  └── {sample}_genometools_stats.txt - └── 04_assembly_qc # Driectory with QC for the assembled haplotypes (one per haplotype) - ├── hap1 - │  ├── busco - │  │  └── busco_{sample}_hap1.txt - │  ├── katplot - │  │  ├── ... - │  ├── LTR - │  │  ├── ... - │  ├── {sample}_hap1_genometools_stats.txt - │  └── telomeres - │  └── ... - ├── merqury - │  └── ... - └── meryl - └── ... +results/ +└── run_results + ├── 01_raw_assembly + │ ├── raw_hap1.gfa # Raw assembly graph for haplotype 1 + │ ├── raw_hap2.gfa # Raw assembly graph for haplotype 2 + │ ├── hap1.fasta.gz # Assembled sequence for haplotype 1 + │ ├── hap2.fasta.gz # Assembled sequence for haplotype 2 + │ └── hifiasm_benchmark.txt # Assembly performance metrics + ├── 02_final_assembly + │ ├── hap1 + │ │ ├── cutoffs # Purge_dups coverage cutoffs + │ │ ├── ragtag_scafold + │ │ │ └── recap.txt # Scaffolding summary + │ │ └── final_hap1.fasta.gz # Final processed haplotype 1 + │ └── hap2 # Similar structure for haplotype 2 + │ └── ... + ├── 03_raw_data_qc + │ ├── genomescope + │ │ ├── linear_plot.png # K-mer frequency distribution + │ │ └── log_plot.png # Log-scale k-mer distribution + │ ├── jellyfish + │ │ ├── run.histo # K-mer count histogram + │ │ └── run.jf # K-mer count database + │ └── genometools_stats.txt # Basic sequence statistics + ├── 04_assembly_qc + │ ├── hap1 + │ │ ├── busco # Completeness assessment + │ │ │ └── busco_run1_hap1.txt + │ │ ├── katplot # K-mer frequency analysis + │ │ │ ├── run1_hap1.katplot.png + │ │ │ ├── run1_hap1-main.mx.spectra-cn.png + │ │ │ └── run1_hap1.stats + │ │ ├── LTR # Transposable element analysis + │ │ │ ├── recap_run1_hap1.tbl + │ │ │ ├── run1_hap1.out.LAI + │ │ │ └── run1_hap1.scn + │ │ ├── run1_hap1_genometools_stats.txt # Assembly statistics + │ │ └── telomeres + │ │ └── run1_hap1_telomeres.txt # Telomere identification results + │ ├── merqury # Assembly quality assessment + │ │ ├── run1_merqury.completeness.stats + │ │ ├── run1_merqury.only.hist + │ │ └──run1_merqury.qv + │ ├── quast # Assembly metrics and comparison + │ │ └── report.html + └── full_qc_report.html # Complete quality control summary ``` \ No newline at end of file diff --git a/doc/software_list.md b/doc/software_list.md index 1abbb5d..5414911 100644 --- a/doc/software_list.md +++ b/doc/software_list.md @@ -1,35 +1,46 @@ -# Workflow steps and program versions -All images here will be pulled automatically by Snakemake the first time you run the workflow. It may take some time. Images are only downloaded once and reused automatically by the workflow. 
-Images are stored on the project's container registry :
+# Workflow Steps and Program Versions
+
+Images are automatically pulled by Snakemake on first run and stored in the project's container registry. Note that you can modify the container registry in the `./.config/masterconfig.yaml` file to add your own instead.
 
 ## 01. Assembly
-- Assembly
-  - [hifiasm](https://github.com/chhylp123/hifiasm) 0.19.6
-  - [YAK](https://github.com/lh3/yak) 0.1
-- Haplotigs and overlaps purging
-  - [purge_dups](https://github.com/dfguan/purge_dups) 1.2.5
-  - **matplotlib** 0.11.5
-- Scafolding
-  - [RagTag](https://github.com/malonge/RagTag)
+
+Assembly:
+- [hifiasm](https://github.com/chhylp123/hifiasm) 0.19.6
+- [YAK](https://github.com/lh3/yak) 0.1
+
+Haplotigs and Overlaps Purging:
+- [purge_dups](https://github.com/dfguan/purge_dups) 1.2.5
+- matplotlib 0.11.5
+
+Scaffolding:
+- [RagTag](https://github.com/malonge/RagTag)
 
 ## Quality Control
-- K-mer analysis
-  - [jellyfish](https://github.com/gmarcais/Jellyfish) 2.3.0
-  - [genomescope](https://github.com/tbenavi1/genomescope2.0) 2.0
-- Metrics
-  - [genometools](https://github.com/genometools/genometools) 1.5.9
-- Assembly quality control
-  - [BUSCO](https://gitlab.com/ezlab/busco) 5.7.1
-  - [KAT](https://github.com/TGAC/KAT) 2.4.1
-- Error rate, QV & phasing
-  - [meryl](https://github.com/marbl/meryl) and [merqury](https://github.com/marbl/merqury) 1.3
-- Detect assembled telomeres
-  - [FindTelomeres](https://github.com/JanaSperschneider/FindTelomeres)
-  - **Biopython** 1.75
-- Repeted elements quantification
-  - [LTR_retriever](https://github.com/oushujun/LTR_retriever) 3.0.1
-  - [LTR_Finder](https://github.com/xzhub/LTR_Finder) latest as of october 2024
-- Contig length exploration
-  - [QUAST](https://github.com/ablab/quast) 5.2.0
-- Report generation
-  - **R markdown** 4.0.3
\ No newline at end of file
+
+K-mer Analysis:
+- [jellyfish](https://github.com/gmarcais/Jellyfish) 2.3.0
+- [genomescope](https://github.com/tbenavi1/genomescope2.0) 2.0
+
+Metrics:
+- [genometools](https://github.com/genometools/genometools) 1.5.9
+
+Assembly Quality Control:
+- [BUSCO](https://gitlab.com/ezlab/busco) 5.7.1
+- [KAT](https://github.com/TGAC/KAT) 2.4.1
+
+Error Rate, QV & Phasing:
+- [meryl](https://github.com/marbl/meryl) and [merqury](https://github.com/marbl/merqury) 1.3
+
+Telomere Detection:
+- [FindTelomeres](https://github.com/JanaSperschneider/FindTelomeres)
+- Biopython 1.75
+
+Repeated Elements Quantification:
+- [LTR_retriever](https://github.com/oushujun/LTR_retriever) 3.0.1
+- [LTR_Finder](https://github.com/xzhub/LTR_Finder) (latest as of October 2024)
+
+Contig Length Analysis:
+- [QUAST](https://github.com/ablab/quast) 5.2.0
+
+Report Generation:
+- R markdown 4.0.3
\ No newline at end of file
-- GitLab

From fea8ed2e845265cbbf727163b1a2e58c72c8caa5 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Tue, 14 Jan 2025 16:52:46 +0100
Subject: [PATCH 149/178] update the who why when

---
 job.sh | 4 ++++
 local_run.sh | 2 +-
 workflow/scripts/parameter_retrieval.py | 3 +++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/job.sh b/job.sh
index e456326..233ad5b 100644
--- a/job.sh
+++ b/job.sh
@@ -6,6 +6,10 @@
 #SBATCH -J asm4pg
 #SBATCH --mem=10G
 
+# Written by Lucien Piat at INRAe
+# Use this script to run asm4pg on an HPC
+# 07/01/25
+
 # Verify arguments
 if [ $# -ne 1 ] || [ "$1" == "help" ]; then
     echo "Use this script to run asm4pg localy or on a single HPC node"
diff --git a/local_run.sh b/local_run.sh
index
f46794f..6c5a99c 100755 --- a/local_run.sh +++ b/local_run.sh @@ -8,8 +8,8 @@ #SBATCH -e slurm_logs/err_job_%j.err # Written by Lucien Piat at INRAe -# 07/01/25 # Use this script to run asm4pg localy or on a single HPC node +# 07/01/25 SNG_BIND=$(pwd) diff --git a/workflow/scripts/parameter_retrieval.py b/workflow/scripts/parameter_retrieval.py index eb37b05..565eec0 100644 --- a/workflow/scripts/parameter_retrieval.py +++ b/workflow/scripts/parameter_retrieval.py @@ -1,5 +1,8 @@ from snakemake.io import expand + +# Written by Lucien Piat at INRAe # Used to retrive the parameters for rules +# 07/01/25 # Fetch the purge level for hifiasm def get_purge_force(wildcards) -> str: -- GitLab From 1a4aa0e30d4cef7fda373037c705238ff6df009c Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Tue, 14 Jan 2025 16:53:25 +0100 Subject: [PATCH 150/178] Fix Isolating QUAST output --- workflow/scripts/quast_call.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/scripts/quast_call.sh b/workflow/scripts/quast_call.sh index e7a7b87..b60a257 100755 --- a/workflow/scripts/quast_call.sh +++ b/workflow/scripts/quast_call.sh @@ -50,9 +50,9 @@ echo "Asm4pg -> Running QUAST..." eval $quast_cmd echo "Asm4pg -> Isolating QUAST output" -cp $OUTPUT_DIR/combined_reference/basic_stats/cumulative_plot.pdf $OUTPUT_DIR cumulative_plot.pdf -cp $OUTPUT_DIR/combined_reference/basic_stats/CG_content_plot.pdf $OUTPUT_DIR CG_content_plot.pdf -cp $OUTPUT_DIR/combined_reference/basic_stats/Nx_plot.pdf $OUTPUT_DIR Nx_plot.pdf +cp $OUTPUT_DIR/combined_reference/basic_stats/cumulative_plot.pdf $OUTPUT_DIR/cumulative_plot.pdf +cp $OUTPUT_DIR/combined_reference/basic_stats/CG_content_plot.pdf $OUTPUT_DIR/CG_content_plot.pdf +cp $OUTPUT_DIR/combined_reference/basic_stats/Nx_plot.pdf $OUTPUT_DIR/Nx_plot.pdf # Exit status check -- GitLab From f455cf8e44f4182b1faae7939841a847474293f8 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 15 Jan 2025 11:04:50 +0100 Subject: [PATCH 151/178] Fix report generation --- workflow/Snakefile | 49 ++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index f656192..5dab9eb 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -401,58 +401,61 @@ rule quast: ragtag_hap2=lambda wildcards: os.path.join(output_dir, f"{wildcards.sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", f"{wildcards.sample}_scafold_hap2.fasta.gz"), reference_genome=get_reference, purge_bool=get_purge_bool, - ragtag_bool=get_ragtag_bool + ragtag_bool=get_ragtag_bool, + results_dir = output_dir container: f"{container_registry}/staphb/quast:5.2.0" threads: 20 resources: - mem_mb=100000, - time="10:00:00" + mem_mb=250000, + time="80:00:00" shell: """ ./workflow/scripts/quast_call.sh {params.reference_genome} {params.purge_bool} {params.ragtag_bool} \ {input.raw_hap1} {input.raw_hap2} {input.final_hap1} {input.final_hap2} \ - {params.ragtag_hap1} {params.ragtag_hap2} {output.quast_output} + {params.ragtag_hap1} {params.ragtag_hap2} {output.quast_output} {params.results_dir} """ +pwd = config.get("output_dir_pdw", os.getcwd()) # NOT TESTED # Rule to generate the html report rule generate_report: input: - genomescope = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png"), - genomescope_sum = rules.genomescope.output.summary, - genometools_on_raw_data = rules.genometools_on_raw_data.output, + genomescope = 
os.path.join(pwd, output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png"), + genomescope_sum = os.path.join(pwd, output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "summary.txt"), + genometools_on_raw_data = os.path.join(pwd, output_dir, "{sample}_results", "03_raw_data_qc", "{sample}_genometools_stats.txt"), - genometools_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "{sample}_hap1_genometools_stats.txt"), - genometools_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "{sample}_hap2_genometools_stats.txt"), + genometools_hap1 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "{sample}_hap1_genometools_stats.txt"), + genometools_hap2 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "{sample}_hap2_genometools_stats.txt"), - busco_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "busco_{sample}_hap1.txt"), - busco_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "busco_{sample}_hap2.txt"), + busco_hap1 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "busco_{sample}_hap1.txt"), + busco_hap2 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "busco_{sample}_hap2.txt"), - kplot_hap1 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "katplot", "{sample}_hap1.katplot.png"), - kplot_hap2 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "katplot", "{sample}_hap2.katplot.png"), + kplot_hap1 =os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "katplot", "{sample}_hap1.katplot.png"), + kplot_hap2 =os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "katplot", "{sample}_hap2.katplot.png"), - telomeres_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "telomeres", "{sample}_hap1_telomeres.txt"), - telomeres_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "telomeres", "{sample}_hap2_telomeres.txt"), + telomeres_hap1 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "telomeres", "{sample}_hap1_telomeres.txt"), + telomeres_hap2 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "telomeres", "{sample}_hap2_telomeres.txt"), - LAI_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "{sample}_hap1.out.LAI"), - LAI_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "{sample}_hap2.out.LAI"), + LAI_hap1 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "{sample}_hap1.out.LAI"), + LAI_hap2 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "{sample}_hap2.out.LAI"), - LRT_recap_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "recap_{sample}_hap1.tbl"), - LRT_recap_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "recap_{sample}_hap2.tbl"), + LRT_recap_hap1 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "recap_{sample}_hap1.tbl"), + LRT_recap_hap2 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "recap_{sample}_hap2.tbl"), - merqury_stats=rules.merqury.output.stats, - merqury_qv=rules.merqury.output.qv + 
merqury_stats=os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.completeness.stats"),
+        merqury_qv=os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.qv"),
     output:
         os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html")
     params:
         sample = "{sample}",
         mode = get_mode,
         assembly_purge_force = get_purge_force,
-        run_purge_dups = get_purge_bool,
+        purge_dups_bool = get_purge_bool,
         busco_lineage = get_busco_lin,
         ploidy = get_ploidy,
         kmer_size = get_kmer_size,
+        ragtag_bool = get_ragtag_bool,
         r1 = lambda wildcards: get_run(wildcards, run=1),
         r2 = lambda wildcards: get_run(wildcards, run=2)
     resources:
@@ -461,4 +464,4 @@ rule generate_report:
     container:
         f"{container_registry}/rmarkdown4.0.3"
     script:
-        "./workflow/scripts/report.Rmd"
\ No newline at end of file
+        "./scripts/report.Rmd"
\ No newline at end of file
-- GitLab

From e1dee80b99a0fb5acc8ecba8d93c3822666e6ed8 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 15 Jan 2025 11:11:51 +0100
Subject: [PATCH 152/178] update the report

---
 workflow/scripts/report.Rmd | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)
 mode change 100644 => 100755 workflow/scripts/report.Rmd

diff --git a/workflow/scripts/report.Rmd b/workflow/scripts/report.Rmd
old mode 100644
new mode 100755
index 78e48c3..629f97e
--- a/workflow/scripts/report.Rmd
+++ b/workflow/scripts/report.Rmd
@@ -19,19 +19,20 @@ output:
 
 ----
 
-# `r snakemake@params[["sample"]]`
+# Global Asm4pg assembly report for: `r snakemake@params[["sample"]]`
 
+## Asm4pg Parameters
 * Hifiasm mode : `r snakemake@params[["mode"]]`
-* Hifiasm purge mode : `r snakemake@params[["assembly_purge_force"]]`
-* Purge conducted: `r if (snakemake@params[["run_purge_dups"]]) { "Yes" } else { "No" }`
-* Busco lineage: `r snakemake@params[["busco_lineage"]]`
-* Genome ploidy: `r snakemake@params[["ploidy"]]`
-* Kmers size: `r snakemake@params[["kmer_size"]]`
-
 * If hi-c or trio mode :
   * parent1/r1 `r snakemake@params[["r1"]]`
   * parent2/r2 `r snakemake@params[["r2"]]`
-----
+* Hifiasm purge force : `r snakemake@params[["assembly_purge_force"]]`
+* Purge_dups executed: `r if (snakemake@params[["purge_dups_bool"]]) { "Yes" } else { "No" }`
+* Genome scaffolded: `r if (snakemake@params[["ragtag_bool"]]) { "Yes" } else { "No" }`
+
+* Busco lineage: `r snakemake@params[["busco_lineage"]]`
+* Genome ploidy: `r snakemake@params[["ploidy"]]`
+* Kmers size: `r snakemake@params[["kmer_size"]]`
 
 ## Raw data QC
 ![genomescope](`r snakemake@input[["genomescope"]]`)
@@ -42,7 +43,7 @@
 cat(readLines(snakemake@input[["genometools_on_raw_data"]]), sep = '\n')
 ```
 
-## Assembly QC - Hifiasm
+## QC on final assembly
 ### Assembly statistics
 #### Hap 1
 ```{r comment='', echo=FALSE}
@@ -71,7 +72,7 @@ Error rate
 cat(readLines(snakemake@input[["merqury_qv"]]), sep = '\n')
 ```
 
-### BUSCO
+### BUSCO score
 #### Hap 1
 ```{r comment='', echo=FALSE}
 cat(readLines(snakemake@input[["busco_hap1"]]), sep = '\n')
@@ -83,7 +84,6 @@
 cat(readLines(snakemake@input[["busco_hap2"]]), sep = '\n')
 ```
 
 ### Telomeres
 Telomeres present in assembly
-
 #### Hap 1
 ```{r comment='', echo=FALSE}
 cat(readLines(snakemake@input[["telomeres_hap1"]]), sep = '\n')
@@ -93,7 +93,7 @@
 cat(readLines(snakemake@input[["telomeres_hap2"]]), sep = '\n')
 ```
 
-### LTR Assembly Index (LAI)
+### Transposable element analysis
 #### Hap 1
 LTR recap
 ```{r comment='', echo=FALSE}
-- GitLab

From
197e60e6819f3dfe17255651b93e20319345fddd Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 15 Jan 2025 11:15:35 +0100
Subject: [PATCH 153/178] Add files to result dir

---
 workflow/scripts/quast_call.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflow/scripts/quast_call.sh b/workflow/scripts/quast_call.sh
index b60a257..cc916e6 100755
--- a/workflow/scripts/quast_call.sh
+++ b/workflow/scripts/quast_call.sh
@@ -15,6 +15,7 @@ FINAL_HAP2="$7"
 RAGTAG_HAP1="$8"
 RAGTAG_HAP2="$9"
 OUTPUT_DIR="${10}"
+RESULT_DIR="$11"
 
 # Create the list of genomes to run quast on
 echo "Asm4pg -> Preparing genome list for QUAST analysis..."
@@ -51,10 +52,9 @@ eval $quast_cmd
 
 echo "Asm4pg -> Isolating QUAST output"
 cp $OUTPUT_DIR/combined_reference/basic_stats/cumulative_plot.pdf $OUTPUT_DIR/cumulative_plot.pdf
-cp $OUTPUT_DIR/combined_reference/basic_stats/CG_content_plot.pdf $OUTPUT_DIR/CG_content_plot.pdf
+cp $OUTPUT_DIR/combined_reference/basic_stats/GC_content_plot.pdf $OUTPUT_DIR/GC_content_plot.pdf
 cp $OUTPUT_DIR/combined_reference/basic_stats/Nx_plot.pdf $OUTPUT_DIR/Nx_plot.pdf
-
-
+cp $OUTPUT_DIR/combined_reference/basic_stats/cumulative_plot.pdf $RESULT_DIR/cumulative_plot.pdf
 # Exit status check
 if [ $? -eq 0 ]; then
     echo "Asm4pg -> QUAST completed successfully."
-- GitLab

From ae632b607ab1e27e0294578625467d0e59276485 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Wed, 15 Jan 2025 11:18:57 +0100
Subject: [PATCH 154/178] Add minimal configuration example

---
 .config/masterconfig.yaml | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml
index bbced5e..9bb5dc1 100644
--- a/.config/masterconfig.yaml
+++ b/.config/masterconfig.yaml
@@ -1,18 +1,15 @@
 # Config file
+# Add the assembly runs you want to do under samples.
+# Here is an example of minimal configuration.
Refer to the documentation for all options + samples: run1: fasta_gz: small_example.fasta.gz - -container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg" +# You can modify the output path if you need to output_dir: "results/" +output_dir_pdw: /mnt/cbib/pangenoak_trials/GenomAsm4pg/ -# run2: -# fasta_gz: small_example.fasta.gz -# assembly_purge_force: 2 -# mode: default -# run_purge_dups: True -# busco_lineage: eudicots_odb10 -# ploidy: 2 -# kmer_size: 21 \ No newline at end of file +# This container registry will be used to download singularity images +container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg" \ No newline at end of file -- GitLab From fba5824cd054f057f45503cd5965177860ed5931 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 15 Jan 2025 11:20:38 +0100 Subject: [PATCH 155/178] print to stderr instead of stdout --- workflow/scripts/parameter_retrieval.py | 46 ++++++++++++++----------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/workflow/scripts/parameter_retrieval.py b/workflow/scripts/parameter_retrieval.py index 565eec0..6e5bb95 100644 --- a/workflow/scripts/parameter_retrieval.py +++ b/workflow/scripts/parameter_retrieval.py @@ -1,79 +1,85 @@ from snakemake.io import expand +import sys # Written by Lucien Piat at INRAe -# Used to retrive the parameters for rules +# Used to retrieve the parameters for rules # 07/01/25 # Fetch the purge level for hifiasm def get_purge_force(wildcards) -> str: - try : + try: force = config["samples"][wildcards.sample]["assembly_purge_force"] except KeyError: - print('Asm4pg -> "assembly_purge_force" unspecified for ' + wildcards.sample + ', using l3 by default') + print(f'Asm4pg -> "assembly_purge_force" unspecified for {wildcards.sample}, using l3 by default', file=sys.stderr) return '3' return force # Fetch the mode for hifiasm def get_mode(wildcards) -> str: - try : + try: mode = config["samples"][wildcards.sample]["mode"] except KeyError: - print('Asm4pg -> "mode" unspecified for ' + wildcards.sample + ', using default assembly mode for hifiasm') + print(f'Asm4pg -> "mode" unspecified for {wildcards.sample}, using default assembly mode for hifiasm', file=sys.stderr) return 'default' return mode # Fetch r1/r2 fasta file for hi-c -def get_run(wildcards, run:int) -> str: - try : - run= config["samples"][wildcards.sample][f"r{run}"] +def get_run(wildcards, run: int) -> str: + try: + run = config["samples"][wildcards.sample][f"r{run}"] except KeyError: return 'None' return run # Fetch the purge mode, return a boolean from config file def get_purge_bool(wildcards) -> bool: - try : + try: purge_bool = config["samples"][wildcards.sample]["run_purge_dups"] except KeyError: - print('Asm4pg -> "run_purge_dups" unspecified for ' + wildcards.sample + ', using "False" by default') + print(f'Asm4pg -> "run_purge_dups" unspecified for {wildcards.sample}, using "False" by default', file=sys.stderr) return False return purge_bool +# Fetch the BUSCO lineage def get_busco_lin(wildcards) -> str: - try : + try: lin = config["samples"][wildcards.sample]["busco_lineage"] except KeyError: - print('Asm4pg -> "busco_lineage" unspecified for ' + wildcards.sample + ', using "eukaryota_odb10" by default') + print(f'Asm4pg -> "busco_lineage" unspecified for {wildcards.sample}, using "eukaryota_odb10" by default', file=sys.stderr) return "eukaryota_odb10" return lin +# Fetch the genome ploidy def get_ploidy(wildcards) -> int: - try : + try: ploidy = 
config["samples"][wildcards.sample]["ploidy"] except KeyError: - print('Asm4pg -> "ploidy" unspecified for ' + wildcards.sample + ', using 2 by default') + print(f'Asm4pg -> "ploidy" unspecified for {wildcards.sample}, using 2 by default', file=sys.stderr) return 2 return ploidy +# Fetch the k-mer size def get_kmer_size(wildcards) -> int: - try : + try: size = config["samples"][wildcards.sample]["kmer_size"] except KeyError: - print('Asm4pg -> "kmer_size" unspecified for ' + wildcards.sample + ', using 21 by default') + print(f'Asm4pg -> "kmer_size" unspecified for {wildcards.sample}, using 21 by default', file=sys.stderr) return 21 return size +# Fetch the reference genome def get_reference(wildcards) -> str: - try : + try: reference_genome = config["samples"][wildcards.sample]["reference_genome"] except KeyError: return 'None' return reference_genome +# Fetch whether to run RagTag def get_ragtag_bool(wildcards) -> bool: - try : + try: ragtag_bool = config["samples"][wildcards.sample]["run_ragtag"] except KeyError: - print('Asm4pg -> "run_ragtag" unspecified for ' + wildcards.sample + ', using "False" by default') + print(f'Asm4pg -> "run_ragtag" unspecified for {wildcards.sample}, using "False" by default', file=sys.stderr) return False - return ragtag_bool \ No newline at end of file + return ragtag_bool -- GitLab From 3a5a7fe3197705aa9fcdd3fb307f35e186b11ca9 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 15 Jan 2025 11:26:56 +0100 Subject: [PATCH 156/178] Include the pwd to paths --- workflow/Snakefile | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 5dab9eb..95d7515 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -10,9 +10,9 @@ import os import yaml container_registry = config.get("container_registry", "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg") -output_dir = config.get("output_dir", "results/") +output_dir = os.path.join(config.get("output_dir_pdw", os.getcwd()), config.get("output_dir", "results/")) + -assembly_qc_folder = os.path.join(output_dir, "{sample}_results", "04_assembly_qc") rule all: input: @@ -416,35 +416,35 @@ rule quast: {params.ragtag_hap1} {params.ragtag_hap2} {output.quast_output} {params.results_dir} """ -pwd = config.get("output_dir_pdw", os.getcwd()) + # NOT TESTED # Rule to generate the html report rule generate_report: input: - genomescope = os.path.join(pwd, output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png"), - genomescope_sum = os.path.join(pwd, output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "summary.txt"), - genometools_on_raw_data = os.path.join(pwd, output_dir, "{sample}_results", "03_raw_data_qc", "{sample}_genometools_stats.txt"), + genomescope = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "linear_plot.png"), + genomescope_sum = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope", "summary.txt"), + genometools_on_raw_data = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "{sample}_genometools_stats.txt"), - genometools_hap1 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "{sample}_hap1_genometools_stats.txt"), - genometools_hap2 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "{sample}_hap2_genometools_stats.txt"), + genometools_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", 
"{sample}_hap1_genometools_stats.txt"), + genometools_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "{sample}_hap2_genometools_stats.txt"), - busco_hap1 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "busco_{sample}_hap1.txt"), - busco_hap2 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "busco_{sample}_hap2.txt"), + busco_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "busco_{sample}_hap1.txt"), + busco_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "busco_{sample}_hap2.txt"), - kplot_hap1 =os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "katplot", "{sample}_hap1.katplot.png"), - kplot_hap2 =os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "katplot", "{sample}_hap2.katplot.png"), + kplot_hap1 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "katplot", "{sample}_hap1.katplot.png"), + kplot_hap2 =os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "katplot", "{sample}_hap2.katplot.png"), - telomeres_hap1 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "telomeres", "{sample}_hap1_telomeres.txt"), - telomeres_hap2 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "telomeres", "{sample}_hap2_telomeres.txt"), + telomeres_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "telomeres", "{sample}_hap1_telomeres.txt"), + telomeres_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "telomeres", "{sample}_hap2_telomeres.txt"), - LAI_hap1 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "{sample}_hap1.out.LAI"), - LAI_hap2 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "{sample}_hap2.out.LAI"), + LAI_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "{sample}_hap1.out.LAI"), + LAI_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "{sample}_hap2.out.LAI"), - LRT_recap_hap1 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "recap_{sample}_hap1.tbl"), - LRT_recap_hap2 = os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "recap_{sample}_hap2.tbl"), + LRT_recap_hap1 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "LTR", "recap_{sample}_hap1.tbl"), + LRT_recap_hap2 = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "LTR", "recap_{sample}_hap2.tbl"), - merqury_stats=os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.completeness.stats"), - merqury_qv=os.path.join(pwd, output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.qv"), + merqury_stats=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.completeness.stats"), + merqury_qv=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.qv"), output: os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html") params: -- GitLab From 125c72d55ee35a13b5b0d2692d64055b24485714 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 15 Jan 2025 17:27:50 +0100 Subject: [PATCH 157/178] Save the recap at the end --- 
workflow/scripts/ragtag_call.sh | 39 +++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/workflow/scripts/ragtag_call.sh b/workflow/scripts/ragtag_call.sh index 2ec8f1d..4892645 100755 --- a/workflow/scripts/ragtag_call.sh +++ b/workflow/scripts/ragtag_call.sh @@ -12,30 +12,35 @@ HAP_IN=$5 HAP_OUT=$6 RECAP=$7 -# Echo parameters into the recap file -echo "RAGTAG: $RAGTAG" > $RECAP -echo "DIRR: $DIRR" >> $RECAP -echo "THREADS: $THREADS" >> $RECAP -echo "REF: $REF" >> $RECAP -echo "HAP_IN: $HAP_IN" >> $RECAP -echo "HAP_OUT: $HAP_OUT" >> $RECAP +# Initialize recap content +RECAP_CONTENT="RAGTAG: $RAGTAG +DIRR: $DIRR +THREADS: $THREADS +REF: $REF +HAP_IN: $HAP_IN +HAP_OUT: $HAP_OUT +" if [[ "$RAGTAG" == "True" || "$RAGTAG" == "true" ]]; then echo "Asm4pg -> Running ragtag" - echo "Asm4pg -> Ragtag execution started" >> $RECAP - mkdir -p $DIRR + RECAP_CONTENT+="Asm4pg -> Ragtag execution started\n" + mkdir -p "$DIRR" - if ragtag.py scaffold -o $DIRR -t $THREADS $REF $HAP_IN; then - gzip $DIRR/ragtag.scaffold.fasta - mv $DIRR/ragtag.scaffold.fasta.gz $HAP_OUT - echo "Asm4pg -> Ragtag execution completed" >> $RECAP - echo "Output file: $HAP_OUT" >> $RECAP + if ragtag.py scaffold -o "$DIRR" -t "$THREADS" "$REF" "$HAP_IN"; then + gzip "$DIRR/ragtag.scaffold.fasta" + mv "$DIRR/ragtag.scaffold.fasta.gz" "$HAP_OUT" + RECAP_CONTENT+="Asm4pg -> Ragtag execution completed\n" + RECAP_CONTENT+="Output file: $HAP_OUT\n" else - echo "Asm4pg -> Ragtag execution failed" >> $RECAP + RECAP_CONTENT+="Asm4pg -> Ragtag execution failed\n" + echo -e "$RECAP_CONTENT" > "$RECAP" exit 1 fi else echo "Asm4pg -> Ragtag option is off" - echo "Asm4pg -> Ragtag option is off" >> $RECAP - mkdir -p $DIRR + RECAP_CONTENT+="Asm4pg -> Ragtag option is off\n" + mkdir -p "$DIRR" fi + +# Write recap content to file at the end +echo -e "$RECAP_CONTENT" > "$RECAP" -- GitLab From 46d6f1201e801ab45841063eddf7dc5e8dac6d78 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 15 Jan 2025 17:28:45 +0100 Subject: [PATCH 158/178] Ignore useless files --- workflow/scripts/quast_call.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/workflow/scripts/quast_call.sh b/workflow/scripts/quast_call.sh index cc916e6..f96ae4c 100755 --- a/workflow/scripts/quast_call.sh +++ b/workflow/scripts/quast_call.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Script to dynamically run quast on produced genomes, with verbose output +# Script to dynamically run quast on produced genomes, with focus on basic stat plots in PNG format # Author: Lucien PIAT # For: Project Pangenoak # Date: January 6, 2025 @@ -34,7 +34,7 @@ fi # Build the quast command echo "Asm4pg -> Building the QUAST command..." -quast_cmd="python /quast-5.2.0/metaquast.py --threads 20 --large " +quast_cmd="python /quast-5.2.0/metaquast.py --threads 20 --large --no-html --no-check --plots-format png " if [ "$REFERENCE_GENOME" != "None" ]; then echo " - Reference genome specified: $REFERENCE_GENOME" quast_cmd+="--reference $REFERENCE_GENOME " @@ -50,11 +50,13 @@ echo "$quast_cmd" echo "Asm4pg -> Running QUAST..." 
eval $quast_cmd -echo "Asm4pg -> Isolating QUAST output" -cp $OUTPUT_DIR/combined_reference/basic_stats/cumulative_plot.pdf $OUTPUT_DIR/cumulative_plot.pdf -cp $OUTPUT_DIR/combined_reference/basic_stats/GC_content_plot.pdf $OUTPUT_DIR/GC_content_plot.pdf -cp $OUTPUT_DIR/combined_reference/basic_stats/Nx_plot.pdf $OUTPUT_DIR/Nx_plot.pdf -cp $OUTPUT_DIR/combined_reference/basic_stats/cumulative_plot.pdf $RESULT_DIR/cumulative_plot.pdf +# Isolating desired outputs +echo "Asm4pg -> Isolating QUAST basic stat plots..." +mkdir -p "$RESULT_DIR" +cp "$OUTPUT_DIR/combined_reference/basic_stats/cumulative_plot.png" "$RESULT_DIR/cumulative_plot.png" +cp "$OUTPUT_DIR/combined_reference/basic_stats/GC_content_plot.png" "$RESULT_DIR/GC_content_plot.png" +cp "$OUTPUT_DIR/combined_reference/basic_stats/Nx_plot.png" "$RESULT_DIR/Nx_plot.png" + # Exit status check if [ $? -eq 0 ]; then echo "Asm4pg -> QUAST completed successfully." -- GitLab From 002930269f31ddc75842219638fa7dcc31d0c12a Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Wed, 15 Jan 2025 17:29:53 +0100 Subject: [PATCH 159/178] Lint the std echos --- workflow/scripts/hifiasm_call.sh | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/workflow/scripts/hifiasm_call.sh b/workflow/scripts/hifiasm_call.sh index c09e2aa..03a9ff4 100755 --- a/workflow/scripts/hifiasm_call.sh +++ b/workflow/scripts/hifiasm_call.sh @@ -15,27 +15,29 @@ RUN_2=$6 PREFIX=$7 echo "Asm4pg -> Given hifiasm parameters :" -echo "MODE: $MODE" -echo "PURGE_FORCE: $PURGE_FORCE" -echo "THREADS: $THREADS" -echo "INPUT: $INPUT" -echo "RUN_1: $RUN_1" -echo "RUN_2: $RUN_2" -echo "PREFIX: $PREFIX" - +echo " MODE: $MODE" +echo " PURGE_FORCE: $PURGE_FORCE" +echo " THREADS: $THREADS" +echo " INPUT: $INPUT" +echo " RUN_1: $RUN_1" +echo " RUN_2: $RUN_2" +echo " PREFIX: $PREFIX" +echo "Asm4pg -> Constructing hifiasm command" # Run the appropriate hifiasm command based on the mode case "$MODE" in default) echo "Asm4pg -> Running hifiasm in default mode..." hifiasm -l${PURGE_FORCE} -o ${PREFIX} -t ${THREADS} ${INPUT} + echo "Asm4pg -> Hifiasm assembly done" ;; hi-c) echo "Asm4pg -> Running hifiasm in hi-c mode..." hifiasm -l${PURGE_FORCE} -o ${PREFIX} -t ${THREADS} --h1 ${RUN_1} --h2 ${RUN_2} ${INPUT} echo "Asm4pg -> Renaming hifiasm output files" mv ${PREFIX}.hic.hap1.p_ctg.gfa ${PREFIX}.bp.hap1.p_ctg.gfa - mv ${PREFIX}.hic.hap2.p_ctg.gfa ${PREFIX}.bp.hap2.p_ctg.gfa + mv ${PREFIX}.hic.hap2.p_ctg.gfa ${PREFIX}.bp.hap2.p_ctg.gfa + echo "Asm4pg -> Hifiasm assembly done" ;; trio) echo "Asm4pg -> Hifiasm called in trio mode..." 
@@ -48,6 +50,7 @@ case "$MODE" in echo "Asm4pg -> Renaming hifiasm output files" mv ${PREFIX}.dip.hap1.p_ctg.gfa ${PREFIX}.bp.hap1.p_ctg.gfa mv ${PREFIX}.dip.hap2.p_ctg.gfa ${PREFIX}.bp.hap2.p_ctg.gfa + echo "Asm4pg -> Hifiasm assembly done" ;; *) echo "Asm4pg -> Unknown hifiasm mode: $MODE" -- GitLab From a62819b1629af7840d061754e0d27eae36ef1f75 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 16 Jan 2025 10:45:40 +0100 Subject: [PATCH 160/178] Remove commented unlock --- local_run.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/local_run.sh b/local_run.sh index 6c5a99c..5a214d8 100755 --- a/local_run.sh +++ b/local_run.sh @@ -2,8 +2,8 @@ ## TMP config to run on the CBIB #SBATCH --job-name=asm4pg -#SBATCH --ntasks=20 -#SBATCH --mem=200G +#SBATCH --ntasks=25 +#SBATCH --mem=300G #SBATCH -o slurm_logs/out_job_%j.out #SBATCH -e slurm_logs/err_job_%j.err @@ -42,7 +42,7 @@ run_snakemake() { snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) --unlock ;; run) - snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) #--unlock + snakemake --use-singularity --singularity-args "-B $SNG_BIND" -j $(nproc) ;; *) echo "Invalid option: $option" -- GitLab From 4361afae934d4e45d72a576a60d8af448e8ed224 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lucien.piat@inrae.fr> Date: Thu, 16 Jan 2025 11:44:01 +0100 Subject: [PATCH 161/178] Include quast plots in report --- workflow/Snakefile | 33 +++++++++++++-------------------- workflow/scripts/quast_call.sh | 29 +++++++---------------------- workflow/scripts/report.Rmd | 6 ++++++ 3 files changed, 26 insertions(+), 42 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 95d7515..033449c 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -28,10 +28,6 @@ rule all: expand( os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html"), sample=config["samples"].keys() - ), - expand( - os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast"), - sample=config["samples"].keys() ) # Genome assembly using hifiasm @@ -388,21 +384,18 @@ rule scafolding: # Rule to create a quast report assessing the quality of all assemblies rule quast: input: - raw_hap1=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap1.fasta.gz"), - raw_hap2=os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap2.fasta.gz"), - final_hap1=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "{sample}_final_hap1.fasta.gz"), - final_hap2=os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "{sample}_final_hap2.fasta.gz"), - scafold1 = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "ragtag_scafold", "recap.txt"), - scafold2 = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", "recap.txt") + raw_hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap1.fasta.gz"), + raw_hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hap2.fasta.gz"), + final_hap1 = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap1", "{sample}_final_hap1.fasta.gz"), + final_hap2 = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap2", "{sample}_final_hap2.fasta.gz"), output: - quast_output=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast") + cumulative_plot = os.path.join(output_dir, "{sample}_results", 
"04_assembly_qc", "quast", "combined_reference", "basic_stats", "cumulative_plot.png"), + Nx_plot = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast", "combined_reference", "basic_stats", "Nx_plot.png") params: - ragtag_hap1=lambda wildcards: os.path.join(output_dir, f"{wildcards.sample}_results", "02_final_assembly", "hap1", "ragtag_scafold", f"{wildcards.sample}_scafold_hap1.fasta.gz"), - ragtag_hap2=lambda wildcards: os.path.join(output_dir, f"{wildcards.sample}_results", "02_final_assembly", "hap2", "ragtag_scafold", f"{wildcards.sample}_scafold_hap2.fasta.gz"), reference_genome=get_reference, purge_bool=get_purge_bool, - ragtag_bool=get_ragtag_bool, - results_dir = output_dir + results_dir = output_dir, + quast_output = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast") container: f"{container_registry}/staphb/quast:5.2.0" threads: 20 @@ -411,12 +404,9 @@ rule quast: time="80:00:00" shell: """ - ./workflow/scripts/quast_call.sh {params.reference_genome} {params.purge_bool} {params.ragtag_bool} \ - {input.raw_hap1} {input.raw_hap2} {input.final_hap1} {input.final_hap2} \ - {params.ragtag_hap1} {params.ragtag_hap2} {output.quast_output} {params.results_dir} + ./workflow/scripts/quast_call.sh {params.reference_genome} {params.purge_bool} {input.raw_hap1} {input.raw_hap2} {input.final_hap1} {input.final_hap2} {params.quast_output} {params.results_dir} """ - - + # NOT TESTED # Rule to generate the html report rule generate_report: @@ -445,6 +435,9 @@ rule generate_report: merqury_stats=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.completeness.stats"), merqury_qv=os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury", "{sample}_merqury.qv"), + + cumulative_plot = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast", "combined_reference", "basic_stats", "cumulative_plot.png"), + Nx_plot = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "quast", "combined_reference", "basic_stats", "Nx_plot.png") output: os.path.join(output_dir, "{sample}_results", "{sample}_assembly_report.html") params: diff --git a/workflow/scripts/quast_call.sh b/workflow/scripts/quast_call.sh index f96ae4c..5da4467 100755 --- a/workflow/scripts/quast_call.sh +++ b/workflow/scripts/quast_call.sh @@ -7,15 +7,12 @@ # Arguments REFERENCE_GENOME="$1" PURGE_BOOL="$2" -RAGTAG_BOOL="$3" -RAW_HAP1="$4" -RAW_HAP2="$5" -FINAL_HAP1="$6" -FINAL_HAP2="$7" -RAGTAG_HAP1="$8" -RAGTAG_HAP2="$9" -OUTPUT_DIR="${10}" -RESULT_DIR="$11" +RAW_HAP1="$3" +RAW_HAP2="$4" +FINAL_HAP1="$5" +FINAL_HAP2="$6" +OUTPUT_DIR="${7}" +RESULT_DIR="$8" # Create the list of genomes to run quast on echo "Asm4pg -> Preparing genome list for QUAST analysis..." @@ -27,14 +24,9 @@ if [ "$PURGE_BOOL" == "True" ]; then echo " - Purge option enabled: added raw haplotypes: $RAW_HAP1, $RAW_HAP2" fi -if [ "$RAGTAG_BOOL" == "True" ]; then - genomes+=("$RAGTAG_HAP1" "$RAGTAG_HAP2") - echo " - RagTag option enabled: added RagTag haplotypes: $RAGTAG_HAP1, $RAGTAG_HAP2" -fi - # Build the quast command echo "Asm4pg -> Building the QUAST command..." 
-quast_cmd="python /quast-5.2.0/metaquast.py --threads 20 --large --no-html --no-check --plots-format png " +quast_cmd="python /quast-5.2.0/metaquast.py --threads 20 --large --no-check --no-snps --no-icarus --plots-format png " if [ "$REFERENCE_GENOME" != "None" ]; then echo " - Reference genome specified: $REFERENCE_GENOME" quast_cmd+="--reference $REFERENCE_GENOME " @@ -50,13 +42,6 @@ echo "$quast_cmd" echo "Asm4pg -> Running QUAST..." eval $quast_cmd -# Isolating desired outputs -echo "Asm4pg -> Isolating QUAST basic stat plots..." -mkdir -p "$RESULT_DIR" -cp "$OUTPUT_DIR/combined_reference/basic_stats/cumulative_plot.png" "$RESULT_DIR/cumulative_plot.png" -cp "$OUTPUT_DIR/combined_reference/basic_stats/GC_content_plot.png" "$RESULT_DIR/GC_content_plot.png" -cp "$OUTPUT_DIR/combined_reference/basic_stats/Nx_plot.png" "$RESULT_DIR/Nx_plot.png" - # Exit status check if [ $? -eq 0 ]; then echo "Asm4pg -> QUAST completed successfully." diff --git a/workflow/scripts/report.Rmd b/workflow/scripts/report.Rmd index 629f97e..21a0b36 100755 --- a/workflow/scripts/report.Rmd +++ b/workflow/scripts/report.Rmd @@ -42,6 +42,12 @@ output: cat(readLines(snakemake@input[["genometools_on_raw_data"]]), sep = '\n') ``` +## Global assembly QC +QUAST Cumulative Plot + + +QUAST Nx Plot + ## QC on final assembly ### Assembly statistics -- GitLab From df535c519acb7db4cc7ec84c1d30a10c6147acd2 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-1.cluster> Date: Thu, 16 Jan 2025 11:49:34 +0100 Subject: [PATCH 162/178] add verbose --- workflow/scripts/haplotigs_handling.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/workflow/scripts/haplotigs_handling.sh b/workflow/scripts/haplotigs_handling.sh index cd1c2ba..a171709 100755 --- a/workflow/scripts/haplotigs_handling.sh +++ b/workflow/scripts/haplotigs_handling.sh @@ -13,6 +13,7 @@ PREFIX=$4 READS=$5 DIRR=$6 +echo "Asm4pg -> Starting haplotigs handling" if [[ "$PURGE_DUPS" == "True" || "$PURGE_DUPS" == "true" ]]; then # Run purge_dups on both haplotypes @@ -20,8 +21,10 @@ if [[ "$PURGE_DUPS" == "True" || "$PURGE_DUPS" == "true" ]]; then echo "Asm4pg -> Running purge_dups on haplotigs..." 
# Create calcutus stats for purge_dups + echo "Asm4pg -> Runing minimap2" minimap2 -xasm20 $HAP_IN $READS | gzip -c - > $DIRR/$PREFIX.paf.gz pbcstat $DIRR/$PREFIX.paf.gz -O $DIRR + calcuts $DIRR/PB.stat > $DIRR/cutoffs 2> $DIRR/calcuts.log # Split assembly & self-self alignment @@ -29,6 +32,7 @@ if [[ "$PURGE_DUPS" == "True" || "$PURGE_DUPS" == "true" ]]; then minimap2 -xasm5 -DP $DIRR/$PREFIX.split $DIRR/$PREFIX.split| gzip -c - > $DIRR/$PREFIX.split.self.paf.gz # Purge haplotigs & overlaps + echo "Asm4pg -> Starting purge_dups" purge_dups -2 -T $DIRR/cutoffs -c $DIRR/PB.base.cov $DIRR/$PREFIX.split.self.paf.gz > $DIRR/dups.bed 2> $DIRR/purge_dups.log # Get purged primary and haplotig sequences from draft assembly @@ -36,10 +40,10 @@ if [[ "$PURGE_DUPS" == "True" || "$PURGE_DUPS" == "true" ]]; then rm $DIRR/dups.bed rm $DIRR/PB.base.cov - rm $DIRR/PB.cov rm $DIRR/*paf.gz rm $DIRR/*split* - mv $DIRR/$PREFIX.purged.fa $HAP_OUT + gzip $DIRR/$PREFIX.purged.fa $HAP_OUT + mv $DIRR/$PREFIX.purged.fa.gz $HAP_OUT rm $DIRR/$PREFIX.hap.fa else @@ -51,4 +55,4 @@ else # Add an empty cutoffs file so snakemake can link the rules echo "No cutoffs, purge_dups is turned off" > $DIRR/cutoffs fi - +echo "Asm4pg -> Done with haplotigs handling" -- GitLab From 1457b6fb34796bc8d00af8900790b341d9a970b6 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-1.cluster> Date: Thu, 16 Jan 2025 11:52:48 +0100 Subject: [PATCH 163/178] update hpc section --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e899790..d88229a 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,8 @@ git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git && cd GenomAsm4pg ### 3. Run the workflow #### <ins>A. On a HPC</ins> -- Edit `job.sh` with your email and add path to the needed modules (`Singularity/Apptainer`, `Miniforge`) -- Provide the environement you created in `job.sh`, under `source activate wf_env`, you can create it like this : +- Edit `job.sh` with path to the modules `Singularity/Apptainer`, `Miniforge` +- Provide and environement with `Snakemake` and `snakemake-executor-plugin-slurmin` in `job.sh`, under `source activate wf_env`, you can create it like this : ```bash conda create -n wf_env -c conda-forge -c bioconda snakemake=8.4.7 snakemake-executor-plugin-slurm ``` -- GitLab From 9c1d37a69669613defb36a6e23031990d74aef46 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-1.cluster> Date: Thu, 16 Jan 2025 11:53:16 +0100 Subject: [PATCH 164/178] add testing config --- .config/masterconfig.yaml | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml index 9bb5dc1..1160fb9 100644 --- a/.config/masterconfig.yaml +++ b/.config/masterconfig.yaml @@ -4,8 +4,32 @@ # Here is an example of mininmal configuration. 
Refer to the documentation for all options samples: - run1: + run2: fasta_gz: small_example.fasta.gz + mode: default + run_purge_dups: True + assembly_purge_force: 2 + busco_lineage: eudicots_odb10 + reference_genome: small_ref.fasta.gz + run_ragtag: True +# run3: +# fasta_gz: small_example.fasta.gz +# mode: hi-c +# r1: small_example.fasta.gz +# r2: small_example.fasta.gz +# run4: +# fasta_gz: small_example.fasta.gz +# mode: hi-c +# r1: small_example.fasta.gz +# r2: small_example.fasta.gz +# run_ragtag: True +# run_purge_dups: True +# reference_genome: small_ref.fasta.gz +# run1: +# fasta_gz: small_example.fasta.gz +# mode: default +# run_purge_dups: False + # You can modify the output path if you need to output_dir: "results/" -- GitLab From 1916f3c72bd7e667e20ab475bd5975961c04b581 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-1.cluster> Date: Thu, 16 Jan 2025 16:12:53 +0100 Subject: [PATCH 165/178] fix busco error --- workflow/Snakefile | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 033449c..a729d10 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -162,15 +162,15 @@ use rule genometools_on_raw_data as genometools_on_assembly with: priority: 0 # BUSCO stats on assembly -rule busco: +rule busco_hap1: input: - rules.unpigz_to_fasta.output + hap = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap1","{sample}_final_hap1.fasta") output: - os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco", "busco_{sample}_hap{n}.txt") + os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "busco_{sample}_hap1.txt") params: - prefix = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "busco"), + prefix = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco"), lineage=get_busco_lin, - sample="{sample}_hap{n}" + sample="{sample}_hap1" threads: 20 resources: mem_mb=100000, @@ -179,12 +179,23 @@ rule busco: f"{container_registry}/busco:5.7.1" shell: """ - busco -f -i {input} -l {params.lineage} --out_path {params.prefix} -o {params.sample} -m genome -c {threads} && + busco -f -i {input.hap} -l {params.lineage} --out_path {params.prefix} -o {params.sample} -m genome -c {threads} && echo "Aasm4pg -> cleaning busco output files" && mv {params.prefix}/{params.sample}/short_summary.specific.{params.lineage}.{params.sample}.txt {output} && rm -rf {params.prefix}/{params.sample} """ +use rule busco_hap1 as busco_hap2 with: + input: + hap = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap2","{sample}_final_hap2.fasta"), + other = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "busco_{sample}_hap1.txt") + output: + os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "busco_{sample}_hap2.txt") + params: + prefix = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco"), + lineage=get_busco_lin, + sample="{sample}_hap2" + # Estimate telomeric region content rule find_telomeres: input: @@ -269,7 +280,8 @@ rule meryl: input: lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"] output: - temp(directory(os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "meryl", "{sample}_reads-db.meryl"))) + db = temp(directory(os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "meryl", "{sample}_reads-db.meryl"))), + output_dir = temp(directory(os.path.join(output_dir, 
"{sample}_results", "04_assembly_qc", "meryl"))) params: km_size = get_kmer_size threads: 20 @@ -279,7 +291,7 @@ rule meryl: container: f"{container_registry}/merqury1.3" shell: - "meryl k={params.km_size} threads={threads} count {input} output {output}" + "meryl k={params.km_size} threads={threads} count {input} output {output.db}" # Calculates metrics like QV and completeness, providing a quantitative assessment of the genome assembly. rule merqury: -- GitLab From 98583ac875181b432a1cf335d7c67edc2c6c2f3d Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-1.cluster> Date: Thu, 16 Jan 2025 16:23:31 +0100 Subject: [PATCH 166/178] add more time --- workflow/Snakefile | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index a729d10..51c0986 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -12,8 +12,6 @@ import yaml container_registry = config.get("container_registry", "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg") output_dir = os.path.join(config.get("output_dir_pdw", os.getcwd()), config.get("output_dir", "results/")) - - rule all: input: # Required final assemblies and report @@ -73,7 +71,7 @@ rule pigz_gfa_to_fasta: threads: 4 resources: mem_mb=25000, - time="10:00:00" + time="80:00:00" container: f"{container_registry}/pigz" shell: @@ -115,7 +113,7 @@ rule unpigz_to_fasta: threads: 4 resources: mem_mb=25000, - time="10:00:00" + time="80:00:00" shell: "unpigz -k -p 1 {input}" @@ -128,10 +126,9 @@ rule cutoffs_graph: graph = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","cutoffs_graph_hap{n}.png") params: out_dir = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}") - threads: 1 resources: mem_mb=10000, - time="01:00:00" + time="80:00:00" container: f"{container_registry}/matplotlib0.11.5" shell: @@ -146,7 +143,7 @@ rule genometools_on_raw_data: priority: 1 resources: mem_mb=100000, - time="10:00:00" + time="80:00:00" threads: 4 container: f"{container_registry}/genometools1.5.9" @@ -174,7 +171,7 @@ rule busco_hap1: threads: 20 resources: mem_mb=100000, - time="10:00:00" + time="80:00:00" container: f"{container_registry}/busco:5.7.1" shell: @@ -205,7 +202,7 @@ rule find_telomeres: threads: 4 resources: mem_mb=40000, - time="10:00:00" + time="80:00:00" container: f"{container_registry}/biopython1.75" shell: @@ -222,8 +219,8 @@ rule jellyfish: km_size = get_kmer_size threads: 10 resources: - mem_mb=8000, - time="10:00:00" + mem_mb=80000, + time="80:00:00" container: f"{container_registry}/jellyfish2.3.0" shell: @@ -247,7 +244,7 @@ rule genomescope: out_dir = os.path.join(output_dir, "{sample}_results", "03_raw_data_qc", "genomescope"), resources: mem_mb=40000, - time="10:00:00" + time="80:00:00" container: f"{container_registry}/genomescope2.0" shell: @@ -266,7 +263,7 @@ rule kat: threads: 4 resources: mem_mb=40000, - time="10:00:00" + time="80:00:00" container: f"{container_registry}/kat2.4.1" shell: @@ -287,7 +284,7 @@ rule meryl: threads: 20 resources: mem_mb=60000, - time="10:00:00" + time="80:00:00" container: f"{container_registry}/merqury1.3" shell: @@ -308,7 +305,7 @@ rule merqury: threads: 20 resources: mem_mb=60000, - time="10:00:00" + time="80:00:00" container: f"{container_registry}/merqury1.3" shell: @@ -333,7 +330,7 @@ rule LTR_finder: os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "LTR", "{sample}_hap{n}.scn") resources: mem_mb=60000, - time="50:00:00" + time="80:00:00" container: 
f"{container_registry}/ltr_finder:latest" shell: @@ -384,7 +381,7 @@ rule scafolding: threads: 4 resources: mem_mb = 80000, - time = "10:00:00" + time="80:00:00" container: f"{container_registry}/ragtag:2.0.1" shell: @@ -465,7 +462,7 @@ rule generate_report: r2 = lambda wildcards: get_run(wildcards, run=2) resources: mem_mb=10000, - time="10:00:00" + time="80:00:00" container: f"{container_registry}/rmarkdown4.0.3" script: -- GitLab From 9dac0a150b55b26ea9553234d4fa91871da9a427 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-1.cluster> Date: Thu, 16 Jan 2025 16:23:59 +0100 Subject: [PATCH 167/178] increase runtime --- .config/snakemake/profiles/slurm/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.config/snakemake/profiles/slurm/config.yaml b/.config/snakemake/profiles/slurm/config.yaml index 7f38863..628156b 100644 --- a/.config/snakemake/profiles/slurm/config.yaml +++ b/.config/snakemake/profiles/slurm/config.yaml @@ -6,4 +6,4 @@ keep-going: True default-resources: #slurm_account: add if needed on your hpc - runtime: 60 \ No newline at end of file + runtime: 80:80:80 \ No newline at end of file -- GitLab From 150c5c348c8ee5d1eabfff94763a470b7de0e535 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-1.cluster> Date: Thu, 16 Jan 2025 16:57:10 +0100 Subject: [PATCH 168/178] change time to base min --- .config/snakemake/profiles/slurm/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.config/snakemake/profiles/slurm/config.yaml b/.config/snakemake/profiles/slurm/config.yaml index 628156b..a1d542a 100644 --- a/.config/snakemake/profiles/slurm/config.yaml +++ b/.config/snakemake/profiles/slurm/config.yaml @@ -6,4 +6,4 @@ keep-going: True default-resources: #slurm_account: add if needed on your hpc - runtime: 80:80:80 \ No newline at end of file + runtime: 4800 \ No newline at end of file -- GitLab From f09daaaa5fa5eb032477784b478b193e44bf792d Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-1.cluster> Date: Thu, 16 Jan 2025 16:57:44 +0100 Subject: [PATCH 169/178] use argument prasing --- workflow/scripts/input_conversion.sh | 170 ++++++--------------------- 1 file changed, 38 insertions(+), 132 deletions(-) diff --git a/workflow/scripts/input_conversion.sh b/workflow/scripts/input_conversion.sh index 8016f14..189d5f0 100644 --- a/workflow/scripts/input_conversion.sh +++ b/workflow/scripts/input_conversion.sh @@ -1,151 +1,57 @@ #!/bin/bash -# Script to convert BAM or FASTQ files to FASTA.gz, and run quality control tools (LongQC or FastQC) with customizable output directories +# Script to convert BAM or FASTQ files to FASTA.gz # Author: Lucien PIAT # Date: January 13, 2025 -# Arguments -INPUT_FILE="$1" -OUTPUT_DIR="$2" -QC_OUTPUT_DIR="$3" - -# Check if the output directory is specified, otherwise default to current directory -if [ -z "$OUTPUT_DIR" ]; then - OUTPUT_DIR="." -fi - -# Check if the QC output directory is specified, otherwise default to current directory -if [ -z "$QC_OUTPUT_DIR" ]; then - QC_OUTPUT_DIR="." 
From f09daaaa5fa5e032477784b478b193e44bf792d Mon Sep 17 00:00:00 2001
From: Lucien Piat <lpiat@bb8-1.cluster>
Date: Thu, 16 Jan 2025 16:57:44 +0100
Subject: [PATCH 169/178] use argument parsing

---
 workflow/scripts/input_conversion.sh | 170 ++++++----------------------
 1 file changed, 38 insertions(+), 132 deletions(-)

diff --git a/workflow/scripts/input_conversion.sh b/workflow/scripts/input_conversion.sh
index 8016f14..189d5f0 100644
--- a/workflow/scripts/input_conversion.sh
+++ b/workflow/scripts/input_conversion.sh
@@ -1,151 +1,57 @@
 #!/bin/bash
-# Script to convert BAM or FASTQ files to FASTA.gz, and run quality control tools (LongQC or FastQC) with customizable output directories
+# Script to convert BAM or FASTQ files to FASTA.gz
 # Author: Lucien PIAT
 # Date: January 13, 2025

-# Arguments
-INPUT_FILE="$1"
-OUTPUT_DIR="$2"
-QC_OUTPUT_DIR="$3"
-
-# Check if the output directory is specified, otherwise default to current directory
-if [ -z "$OUTPUT_DIR" ]; then
-    OUTPUT_DIR="."
-fi
-
-# Check if the QC output directory is specified, otherwise default to current directory
-if [ -z "$QC_OUTPUT_DIR" ]; then
-    QC_OUTPUT_DIR="."
-fi
-
-# Create a recap file in the output directory
-RECAP_FILE="$OUTPUT_DIR/recap.txt"
-echo "Recap of transformations and QC steps" > "$RECAP_FILE"
-echo "=====================================" >> "$RECAP_FILE"
-echo "Input file: $INPUT_FILE" >> "$RECAP_FILE"
-echo "Output directory: $OUTPUT_DIR" >> "$RECAP_FILE"
-echo "QC output directory: $QC_OUTPUT_DIR" >> "$RECAP_FILE"
-echo "" >> "$RECAP_FILE"
-
-# Function to convert BAM to FASTA.gz
-convert_bam_to_fasta() {
-    BAM_FILE="$1"
-    OUTPUT_FILE="$2"
-
-    echo "Converting BAM to FASTA: $BAM_FILE -> $OUTPUT_FILE"
-    samtools fasta "$BAM_FILE" | gzip > "$OUTPUT_FILE"
-    if [ $? -eq 0 ]; then
-        echo "Conversion completed: $OUTPUT_FILE"
-        echo "Converted BAM to FASTA: $BAM_FILE -> $OUTPUT_FILE" >> "$RECAP_FILE"
-    else
-        echo "Error: BAM to FASTA conversion failed!"
-        exit 1
-    fi
-}
-
-# Function to convert FASTQ to FASTA.gz
-convert_fastq_to_fasta() {
-    FASTQ_FILE="$1"
-    OUTPUT_FILE="$2"
-
-    echo "Converting FASTQ to FASTA: $FASTQ_FILE -> $OUTPUT_FILE"
-    seqtk seq -a "$FASTQ_FILE" | gzip > "$OUTPUT_FILE"
-    if [ $? -eq 0 ]; then
-        echo "Conversion completed: $OUTPUT_FILE"
-        echo "Converted FASTQ to FASTA: $FASTQ_FILE -> $OUTPUT_FILE" >> "$RECAP_FILE"
-    else
-        echo "Error: FASTQ to FASTA conversion failed!"
-        exit 1
-    fi
-}
-
-# Function to zip a FASTA file
-zip_fasta() {
-    FASTA_FILE="$1"
-    OUTPUT_FILE="$2"
-
-    echo "Zipping FASTA: $FASTA_FILE -> $OUTPUT_FILE"
-    gzip -c "$FASTA_FILE" > "$OUTPUT_FILE"
-    if [ $? -eq 0 ]; then
-        echo "Zipping completed: $OUTPUT_FILE"
-        echo "Zipped FASTA: $FASTA_FILE -> $OUTPUT_FILE" >> "$RECAP_FILE"
-    else
-        echo "Error: FASTA zipping failed!"
-        exit 1
-    fi
-}
-
-# Function to run LongQC on a BAM file
-run_longqc() {
-    BAM_FILE="$1"
-    QC_OUTPUT="$2"
-
-    echo "Running LongQC on BAM file: $BAM_FILE"
-    longqc "$BAM_FILE" -o "$QC_OUTPUT"
-    if [ $? -eq 0 ]; then
-        echo "LongQC completed successfully. Results saved to $QC_OUTPUT"
-        echo "LongQC completed on BAM: $BAM_FILE -> $QC_OUTPUT" >> "$RECAP_FILE"
-    else
-        echo "Error: LongQC failed!"
-        exit 1
-    fi
-}
-
-# Function to run FastQC on a FASTQ file
-run_fastqc() {
-    FASTQ_FILE="$1"
-    QC_OUTPUT="$2"
-
-    echo "Running FastQC on FASTQ file: $FASTQ_FILE"
-    fastqc "$FASTQ_FILE" --outdir="$QC_OUTPUT"
-    if [ $? -eq 0 ]; then
-        echo "FastQC completed successfully. Results saved to $QC_OUTPUT"
-        echo "FastQC completed on FASTQ: $FASTQ_FILE -> $QC_OUTPUT" >> "$RECAP_FILE"
-    else
-        echo "Error: FastQC failed!"
-        exit 1
-    fi
+# Function to display usage
+usage() {
+    echo "Usage: $0 -i <input_file> -o <output_file>"
+    echo "Supported input formats: .bam, .fastq, .fastq.gz"
+    echo "Output will be a compressed FASTA (.fasta.gz) file."
+    exit 1
 }

-# Check file extension and process accordingly
-FILE_EXTENSION="${INPUT_FILE##*.}"
+# Parse arguments
+while getopts ":i:o:" opt; do
+    case $opt in
+        i) input_file="$OPTARG" ;;
+        o) output_file="$OPTARG" ;;
+        *) usage ;;
+    esac
+done
+
+# Check if input and output files are provided
+if [ -z "$input_file" ] || [ -z "$output_file" ]; then
+    usage
+fi

-# Ensure the input file exists
-if [ ! -f "$INPUT_FILE" ]; then
-    echo "Error: Input file does not exist!"
+# Ensure necessary tools are installed
+if ! command -v samtools &> /dev/null || ! command -v seqtk &> /dev/null; then
+    echo "Error: 'samtools' and 'seqtk' are required but not installed."
     exit 1
 fi

-# Handle different file types
-case "$FILE_EXTENSION" in
-    bam)
-        # If it's a BAM file, convert it to FASTA.gz and run LongQC
-        OUTPUT_FILE="$OUTPUT_DIR/$(basename "$INPUT_FILE" .bam).fasta.gz"
-        convert_bam_to_fasta "$INPUT_FILE" "$OUTPUT_FILE"
-        run_longqc "$INPUT_FILE" "$QC_OUTPUT_DIR"
+# Determine the input file type and convert
+case "$input_file" in
+    *.bam)
+        echo "Processing BAM file..."
+        samtools fasta "$input_file" | gzip > "$output_file"
         ;;
-    fastq)
-        # If it's a FASTQ file, convert it to FASTA.gz and run FastQC
-        OUTPUT_FILE="$OUTPUT_DIR/$(basename "$INPUT_FILE" .fastq).fasta.gz"
-        if [[ "$INPUT_FILE" == *.gz ]]; then
-            # If the FASTQ file is gzipped, unzip before converting
-            gunzip -c "$INPUT_FILE" | seqtk seq -a | gzip > "$OUTPUT_FILE"
-        else
-            convert_fastq_to_fasta "$INPUT_FILE" "$OUTPUT_FILE"
-        fi
-        run_fastqc "$INPUT_FILE" "$QC_OUTPUT_DIR"
+    *.fastq)
+        echo "Processing FASTQ file..."
+        seqtk seq -A "$input_file" | gzip > "$output_file"
         ;;
-    fasta)
-        # If it's already a FASTA file, just zip it
-        OUTPUT_FILE="$OUTPUT_DIR/$(basename "$INPUT_FILE" .fasta).fasta.gz"
-        zip_fasta "$INPUT_FILE" "$OUTPUT_FILE"
+    *.fastq.gz)
+        echo "Processing compressed FASTQ file..."
+        seqtk seq -A <(gzip -dc "$input_file") | gzip > "$output_file"
         ;;
     *)
-        echo "Error: Unsupported file type: $FILE_EXTENSION"
+        echo "Error: Unsupported file type. Please provide a .bam, .fastq, or .fastq.gz file."
         exit 1
         ;;
 esac

-echo "Processing completed. Output saved to: $OUTPUT_DIR"
-echo "Quality control results saved to: $QC_OUTPUT_DIR"
-echo "Recap saved to: $RECAP_FILE"
+# Confirm completion
+echo "Conversion complete. Output written to: $output_file"
--
GitLab
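The rewrite above swaps positional arguments for getopts-based flag parsing; the leading colon in ":i:o:" enables silent error handling, so an unknown flag or a missing option argument falls through to usage() instead of printing bash's default message. Assuming samtools and seqtk are on the PATH, the converter is now invoked like this (file names are illustrative):

    bash workflow/scripts/input_conversion.sh -i reads.bam -o reads.fasta.gz
    bash workflow/scripts/input_conversion.sh -i reads.fastq.gz -o reads.fasta.gz

Each call writes a gzip-compressed FASTA and exits non-zero on an unsupported extension, which lets the calling Snakemake rule detect the failure.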
From 2070a92d149256c0b20dea65b6e42d2939298d24 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lucien.piat@inrae.fr>
Date: Fri, 17 Jan 2025 11:31:27 +0100
Subject: [PATCH 170/178] remove old files

---
 workflow/scripts/report_trio.Rmd | 167 ------------------------------
 workflow/unimplemented.smk       |  87 ----------------
 2 files changed, 254 deletions(-)
 delete mode 100644 workflow/scripts/report_trio.Rmd
 delete mode 100644 workflow/unimplemented.smk

diff --git a/workflow/scripts/report_trio.Rmd b/workflow/scripts/report_trio.Rmd
deleted file mode 100644
index 380500c..0000000
--- a/workflow/scripts/report_trio.Rmd
+++ /dev/null
@@ -1,167 +0,0 @@
----
-title: "Assembly Report"
-author:
-  -
-date: "`r format(Sys.time(), '%d %B, %Y')`"
-params:
-  rmd: "report_trio.Rmd"
-output:
-  html_document:
-    highlight: tango
-    number_sections: no
-    theme: default
-    toc: yes
-    toc_depth: 3
-    toc_float:
-      collapsed: no
-      smooth_scroll: yes
----
-
-----
-
-# `r snakemake@params[["id"]]` - run: `r snakemake@params[["run"]]`
-* Run : `r snakemake@params[["run"]]`
-* Hifiasm mode : `r snakemake@params[["mode"]]`
-* Hifiasm purge mode [0-3]: `r snakemake@params[["purge_force"]]`
-* Parent 1 : `r snakemake@params[["p1"]]`
-* Parent 2 : `r snakemake@params[["p2"]]`
-* Purge conducted: `r if (snakemake@params[["purge"]]) { "Yes" } else { "No" }`
-
-----
-
-## Raw data QC
-
-
-### Reads statistics
-```{r comment='', echo=FALSE}
-cat(readLines(snakemake@input[["gt_reads"]]), sep = '\n')
-```
-
-----
-
-## Assembly QC - Hifiasm
-### Assembly statistics
-#### Hap 1
-```{r comment='', echo=FALSE}
-cat(readLines(snakemake@input[["gt_asm_1"]]), sep = '\n')
-```
-
-#### Hap 2
-```{r comment='', echo=FALSE}
-cat(readLines(snakemake@input[["gt_asm_2"]]), sep = '\n')
-```
-
-### K-mer profiles
-| Hap 1 | Hap 2 |
-|-------|-------|
-|  |  |
-
-### BUSCO
-#### Hap 1
-```{r 
comment='', echo=FALSE} -cat(readLines(snakemake@input[["busco_1"]]), sep = '\n') -``` -#### Hap 2 -```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["busco_2"]]), sep = '\n') -``` - -### Phasing -* Parent 1 : `r snakemake@params[["p1"]]` -* Parent 2 : `r snakemake@params[["p2"]]` - - - -Blocks and switch error rate - -| Hap 1 | Hap 2 | -|-------|-------| -|  |  | - -#### Hap 1 -```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["merq_block_stats_1"]]), sep = '\n') -``` -#### Hap 2 -```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["merq_block_stats_2"]]), sep = '\n') -``` - -### K-mer completeness and error rate -Completeness - -```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["merq_comp"]]), sep = '\n') -``` - -Error rate -```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["merq_err"]]), sep = '\n') -``` - -### Telomeres -Telomeres present in assembly - -#### Hap 1 -```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["tel_1"]]), sep = '\n') -``` -#### Hap 2 -```{r comment='', echo=FALSE} -cat(readLines(snakemake@input[["tel_2"]]), sep = '\n') -``` - ----- - -`r if (snakemake@params[["purge"]]) { "## Assembly QC - After Purge_dups" }` -`r if (snakemake@params[["purge"]]) { "### Assembly statistics" }` -`r if (snakemake@params[["purge"]]) { "#### Hap 1" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_gt_asm_1"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "#### Hap 2" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_gt_asm_2"]]), sep = '\n') -``` - -`r if (snakemake@params[["purge"]]) { "### BUSCO" }` -`r if (snakemake@params[["purge"]]) { "#### Hap 1" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_busco_1"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "#### Hap 2" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_busco_2"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "### Phasing" }` -`r if (snakemake@params[["purge"]]) { * Parent 1 : `r snakemake@params[["p1"]]` }` -`r if (snakemake@params[["purge"]]) { * Parent 2 : `r snakemake@params[["p2"]]` }` -`r if (snakemake@params[["purge"]]) { `r snakemake@input[["P_merq_blob"]]`}` - -`r if (snakemake@params[["purge"]]) { "#### Hap 1" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_merq_block_stats_1"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "#### Hap 2" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_merq_block_stats_2"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "### K-mer completeness and error rate" }` -`r if (snakemake@params[["purge"]]) { "Completeness" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_merq_comp"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "Error rate " }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_merq_err"]]), sep = '\n') -``` -`r if (snakemake@params[["purge"]]) { "### Telomeres" }` -`r if (snakemake@params[["purge"]]) { "Telomeres present in assembly" }` -`r if (snakemake@params[["purge"]]) { "#### Hap 1" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_tel_1"]]), sep = '\n') 
-``` -`r if (snakemake@params[["purge"]]) { "#### Hap 2" }` -```{r comment='', echo=FALSE, eval=snakemake@params[["purge"]]} -cat(readLines(snakemake@input[["P_tel_2"]]), sep = '\n') -``` \ No newline at end of file diff --git a/workflow/unimplemented.smk b/workflow/unimplemented.smk deleted file mode 100644 index f4e3a87..0000000 --- a/workflow/unimplemented.smk +++ /dev/null @@ -1,87 +0,0 @@ - -rule meryl_trio: - input: - p1 = get_p1, - p2 = get_p2 - output: - p1 = directory(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_P1_reads-db_k21.meryl"), - p2 = directory(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_P2_reads-db_k21.meryl") - benchmark: - res_path + "/{runid}/benchmark/{id}_meryl_trio.txt" - threads: 10 - resources: - mem_mb=60000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/merqury1.3" - shell: - "meryl k=21 count {input.p1} output {output.p1} && " - "meryl k=21 count {input.p2} output {output.p2}" - -rule merqury_trio: - input: - p1 = rules.meryl_trio.output.p1, - p2 = rules.meryl_trio.output.p2, - read_db = rules.meryl.output, - hap1 = rules.unzip.output.hap1, - hap2 = rules.unzip.output.hap2 - output: - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_merqury_trio.qv", - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_merqury_trio.completeness.stats", - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_merqury_trio.{id}_hap1.block.N.png", - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_merqury_trio.{id}_hap2.block.N.png", - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_merqury_trio.{id}_hap1.100_20000.phased_block.stats", - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_merqury_trio.{id}_hap2.100_20000.phased_block.stats", - res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_merqury_trio.hapmers.blob.png", - p1_hapmer = directory(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_P1_reads-db_k21.hapmer.meryl"), - p2_hapmer = directory(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_P2_reads-db_k21.hapmer.meryl") - params: - path = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury", - prefix = "{id}_merqury_trio" - benchmark: - res_path + "/{runid}/benchmark/{id}_merqury_trio.txt" - threads: 20 - resources: - mem_mb=60000 - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/merqury1.3" - shell: - "cd {params.path} && " - "export MERQURY=/usr/local/share/merqury && " - "$MERQURY/trio/hapmers.sh {input.p1} {input.p2} {input.read_db} && " - "merqury.sh {input.read_db} {output.p1_hapmer} {output.p2_hapmer} {input.hap1} {input.hap2} {params.prefix}" - -rule no_purge_report_trio: - input: - # Reads QC - genomescope=RAW_QC + "/04_kmer/{id}_genomescope/linear_plot.png", - gt_reads=RAW_QC + "/03_genometools/{id}.RawStat.txt", - # Hifiasm assembly QC - gt_asm_1=ASM_QC + "/assembly_stats/{id}_hap1.AStats.txt", - gt_asm_2=ASM_QC + "/assembly_stats/{id}_hap2.AStats.txt", - busco_1=ASM_QC + "/busco/{id}_hap1/short_summary.specific.{lin}.{id}_hap1.txt", - busco_2=ASM_QC + "/busco/{id}_hap2/short_summary.specific.{lin}.{id}_hap2.txt", - kplot_1=ASM_QC + "/katplot/hap1/{id}_hap1.katplot.png", - kplot_2=ASM_QC + 
"/katplot/hap2/{id}_hap2.katplot.png", - tel_1=ASM_QC + "/telomeres/{id}_hap1_telomeres.txt", - tel_2=ASM_QC + "/telomeres/{id}_hap2_telomeres.txt", - merq_comp=ASM_QC + "/merqury/{id}_merqury_trio.completeness.stats", - merq_err=ASM_QC + "/merqury/{id}_merqury_trio.qv", - merq_blob=ASM_QC + "/merqury/{id}_merqury_trio.hapmers.blob.png", - merq_block_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.block.N.png", - merq_block_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.block.N.png", - merq_block_stats_1=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap1.100_20000.phased_block.stats", - merq_block_stats_2=ASM_QC + "/merqury/{id}_merqury_trio.{id}_hap2.100_20000.phased_block.stats" - output: - res_path + "/{runid}/{id}/{lin}/report_trio.html" - params: - id="{id}", # get filename - mode=get_mode, # get assembly mode - p1=get_p1, - p2=get_p2, - run=get_run, - purge=get_purge, - purge_force = get_purge_force - container: - "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/rmarkdown4.0.3" - script: - "../scripts/report_trio.Rmd" \ No newline at end of file -- GitLab From 044f61ca79d93980e50e81a2df5d1a0344305179 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-2.cluster> Date: Fri, 17 Jan 2025 11:35:53 +0100 Subject: [PATCH 171/178] update to hifiasm:0.24.0-r703 --- doc/dag.svg | 326 ++++++++++++++++--------------- doc/software_list.md | 2 +- workflow/Snakefile | 3 +- workflow/scripts/hifiasm_call.sh | 6 +- 4 files changed, 178 insertions(+), 159 deletions(-) diff --git a/doc/dag.svg b/doc/dag.svg index 24b683f..1ea1fae 100644 --- a/doc/dag.svg +++ b/doc/dag.svg @@ -1,301 +1,319 @@ <?xml version="1.0" standalone="no"?> -<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="778pt" height="548pt" viewBox="0.00 0.00 778.00 548.00"> -<g id="graph0" class="graph" transform="translate(4,544) scale(1)" data-name="snakemake_dag"> +<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="833pt" height="548pt" viewBox="0.00 0.00 833.00 548.00"> +<g id="graph0" class="graph" transform="translate(6.404022216796875,529.5757446289062) scale(1)" data-name="snakemake_dag"> -<polygon fill="white" stroke="none" points="-4,4 -4,-544 774,-544 774,4 -4,4" style=""/> +<polygon fill="white" stroke="none" points="-4,4 -4,-544 829,-544 829,4 -4,4" style=""/> <!-- 0 --> <g id="node1" class="node" pointer-events="visible" data-name="0"> -<path fill="none" stroke="#d88d56" stroke-width="2" d="M253,-36C253,-36 223,-36 223,-36 217,-36 211,-30 211,-24 211,-24 211,-12 211,-12 211,-6 217,0 223,0 223,0 253,0 253,0 259,0 265,-6 265,-12 265,-12 265,-24 265,-24 265,-30 259,-36 253,-36" style=""/> -<text text-anchor="middle" x="238" y="-15" font-family="sans" font-size="10.00" style="">all</text> +<path fill="none" stroke="#afd856" stroke-width="2" d="M126,-36C126,-36 96,-36 96,-36 90,-36 84,-30 84,-24 84,-24 84,-12 84,-12 84,-6 90,0 96,0 96,0 126,0 126,0 132,0 138,-6 138,-12 138,-12 138,-24 138,-24 138,-30 132,-36 126,-36" style=""/> +<text text-anchor="middle" x="111" y="-15" font-family="sans" font-size="10.00" style="">all</text> </g> <!-- 1 --> <g id="node2" class="node" pointer-events="visible" data-name="1"> -<path fill="none" stroke="#d85656" stroke-width="2" d="M432.16,-396C432.16,-396 363.84,-396 363.84,-396 357.84,-396 351.84,-390 351.84,-384 351.84,-384 351.84,-372 351.84,-372 351.84,-366 357.84,-360 363.84,-360 363.84,-360 432.16,-360 432.16,-360 438.16,-360 444.16,-366 444.16,-372 444.16,-372 444.16,-384 444.16,-384 444.16,-390 438.16,-396 
432.16,-396" style=""/> -<text text-anchor="middle" x="398" y="-375" font-family="sans" font-size="10.00" style="">haplotigs_handling</text> +<path fill="none" stroke="#56d892" stroke-width="2" d="M329.16,-396C329.16,-396 260.84,-396 260.84,-396 254.84,-396 248.84,-390 248.84,-384 248.84,-384 248.84,-372 248.84,-372 248.84,-366 254.84,-360 260.84,-360 260.84,-360 329.16,-360 329.16,-360 335.16,-360 341.16,-366 341.16,-372 341.16,-372 341.16,-384 341.16,-384 341.16,-390 335.16,-396 329.16,-396" style=""/> +<text text-anchor="middle" x="295" y="-375" font-family="sans" font-size="10.00" style="">haplotigs_handling</text> </g> <!-- 1->0 --> -<g id="edge1" class="edge" data-name="1->0"> +<g id="edge3" class="edge" data-name="1->0"> -<path fill="none" stroke="grey" stroke-width="2" d="M444.92,-374.87C526.99,-368.35 688,-341.93 688,-235 688,-235 688,-235 688,-161 688,-75.57 389.35,-35.09 278.65,-23.02" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="279.13,-19.55 268.82,-21.98 278.39,-26.52 279.13,-19.55" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M250.93,-359.05C207.89,-339.24 143.79,-303.21 108,-252 61.31,-185.2 48.06,-149.59 73,-72 75.87,-63.06 80.87,-54.35 86.32,-46.66" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="88.94,-49 92.29,-38.95 83.4,-44.72 88.94,-49" style=""/> </g> <!-- 5 --> <g id="node6" class="node" pointer-events="visible" data-name="5"> -<path fill="none" stroke="#56d8a2" stroke-width="2" d="M248.87,-324C248.87,-324 193.13,-324 193.13,-324 187.13,-324 181.13,-318 181.13,-312 181.13,-312 181.13,-300 181.13,-300 181.13,-294 187.13,-288 193.13,-288 193.13,-288 248.87,-288 248.87,-288 254.87,-288 260.87,-294 260.87,-300 260.87,-300 260.87,-312 260.87,-312 260.87,-318 254.87,-324 248.87,-324" style=""/> -<text text-anchor="middle" x="221" y="-303" font-family="sans" font-size="10.00" style="">unpigz_to_fasta</text> +<path fill="none" stroke="#5673d8" stroke-width="2" d="M322.87,-324C322.87,-324 267.13,-324 267.13,-324 261.13,-324 255.13,-318 255.13,-312 255.13,-312 255.13,-300 255.13,-300 255.13,-294 261.13,-288 267.13,-288 267.13,-288 322.87,-288 322.87,-288 328.87,-288 334.87,-294 334.87,-300 334.87,-300 334.87,-312 334.87,-312 334.87,-318 328.87,-324 322.87,-324" style=""/> +<text text-anchor="middle" x="295" y="-303" font-family="sans" font-size="10.00" style="">unpigz_to_fasta</text> </g> <!-- 1->5 --> -<g id="edge8" class="edge" data-name="1->5"> +<g id="edge7" class="edge" data-name="1->5"> -<path fill="none" stroke="grey" stroke-width="2" d="M352.88,-359.15C328.76,-349.62 298.93,-337.82 273.75,-327.86" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="275.2,-324.67 264.62,-324.25 272.63,-331.18 275.2,-324.67" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M295,-359.34C295,-352.75 295,-345.08 295,-337.67" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="298.5,-337.93 295,-327.93 291.5,-337.93 298.5,-337.93" style=""/> </g> <!-- 10 --> <g id="node11" class="node" pointer-events="visible" data-name="10"> -<path fill="none" stroke="#d6d856" stroke-width="2" d="M548.32,-324C548.32,-324 447.68,-324 447.68,-324 441.68,-324 435.68,-318 435.68,-312 435.68,-312 435.68,-300 435.68,-300 435.68,-294 441.68,-288 447.68,-288 447.68,-288 548.32,-288 548.32,-288 554.32,-288 560.32,-294 560.32,-300 560.32,-300 560.32,-312 560.32,-312 560.32,-318 554.32,-324 548.32,-324" style=""/> -<text text-anchor="middle" x="498" y="-303" font-family="sans" 
font-size="10.00" style="">genometools_on_assembly</text> +<path fill="none" stroke="#56d87b" stroke-width="2" d="M741.32,-324C741.32,-324 640.68,-324 640.68,-324 634.68,-324 628.68,-318 628.68,-312 628.68,-312 628.68,-300 628.68,-300 628.68,-294 634.68,-288 640.68,-288 640.68,-288 741.32,-288 741.32,-288 747.32,-288 753.32,-294 753.32,-300 753.32,-300 753.32,-312 753.32,-312 753.32,-318 747.32,-324 741.32,-324" style=""/> +<text text-anchor="middle" x="691" y="-303" font-family="sans" font-size="10.00" style="">genometools_on_assembly</text> </g> <!-- 1->10 --> -<g id="edge18" class="edge" data-name="1->10"> +<g id="edge19" class="edge" data-name="1->10"> -<path fill="none" stroke="grey" stroke-width="2" d="M423.49,-359.15C435.25,-350.93 449.4,-341.02 462.2,-332.06" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="463.89,-335.15 470.07,-326.55 459.87,-329.42 463.89,-335.15" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M342.05,-370.05C404.58,-360.62 518.4,-342.79 615,-324 615.13,-323.97 615.27,-323.95 615.4,-323.92" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="615.94,-327.38 625.05,-321.97 614.56,-320.52 615.94,-327.38" style=""/> </g> -<!-- 12 --> -<g id="node13" class="node" pointer-events="visible" data-name="12"> +<!-- 13 --> +<g id="node14" class="node" pointer-events="visible" data-name="13"> -<path fill="none" stroke="#56d8c1" stroke-width="2" d="M380,-180C380,-180 350,-180 350,-180 344,-180 338,-174 338,-168 338,-168 338,-156 338,-156 338,-150 344,-144 350,-144 350,-144 380,-144 380,-144 386,-144 392,-150 392,-156 392,-156 392,-168 392,-168 392,-174 386,-180 380,-180" style=""/> -<text text-anchor="middle" x="365" y="-159" font-family="sans" font-size="10.00" style="">kat</text> +<path fill="none" stroke="#56d0d8" stroke-width="2" d="M661,-180C661,-180 631,-180 631,-180 625,-180 619,-174 619,-168 619,-168 619,-156 619,-156 619,-150 625,-144 631,-144 631,-144 661,-144 661,-144 667,-144 673,-150 673,-156 673,-156 673,-168 673,-168 673,-174 667,-180 661,-180" style=""/> +<text text-anchor="middle" x="646" y="-159" font-family="sans" font-size="10.00" style="">kat</text> </g> -<!-- 1->12 --> -<g id="edge20" class="edge" data-name="1->12"> +<!-- 1->13 --> +<g id="edge24" class="edge" data-name="1->13"> -<path fill="none" stroke="grey" stroke-width="2" d="M396.71,-359.01C394.38,-329.12 388.78,-267.43 379,-216 377.56,-208.43 375.63,-200.34 373.66,-192.82" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="377.11,-192.16 371.09,-183.44 370.36,-194.01 377.11,-192.16" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M342.09,-364.95C402.58,-347.69 508.52,-311.31 582,-252 602.36,-235.57 619.31,-211.12 630.64,-192" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="633.55,-193.96 635.45,-183.54 627.46,-190.5 633.55,-193.96" style=""/> </g> -<!-- 16 --> -<g id="node17" class="node" pointer-events="visible" data-name="16"> +<!-- 17 --> +<g id="node18" class="node" pointer-events="visible" data-name="17"> -<path fill="none" stroke="#88d856" stroke-width="2" d="M758,-324C758,-324 728,-324 728,-324 722,-324 716,-318 716,-312 716,-312 716,-300 716,-300 716,-294 722,-288 728,-288 728,-288 758,-288 758,-288 764,-288 770,-294 770,-300 770,-300 770,-312 770,-312 770,-318 764,-324 758,-324" style=""/> -<text text-anchor="middle" x="743" y="-303" font-family="sans" font-size="10.00" style="">merqury</text> +<path fill="none" stroke="#61d856" stroke-width="2" d="M813,-324C813,-324 
783,-324 783,-324 777,-324 771,-318 771,-312 771,-312 771,-300 771,-300 771,-294 777,-288 783,-288 783,-288 813,-288 813,-288 819,-288 825,-294 825,-300 825,-300 825,-312 825,-312 825,-318 819,-324 813,-324" style=""/> +<text text-anchor="middle" x="798" y="-303" font-family="sans" font-size="10.00" style="">merqury</text> </g> -<!-- 1->16 --> -<g id="edge26" class="edge" data-name="1->16"> +<!-- 1->17 --> +<g id="edge29" class="edge" data-name="1->17"> -<path fill="none" stroke="grey" stroke-width="2" d="M445.08,-372.06C505.46,-364.98 613,-349.87 702,-324 702.27,-323.92 702.54,-323.84 702.81,-323.76" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="703.89,-327.09 712.2,-320.53 701.61,-320.47 703.89,-327.09" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M342.04,-376.71C425.13,-375.13 602.98,-366.92 757.54,-324.77" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="758.44,-328.15 767.13,-322.09 756.55,-321.41 758.44,-328.15" style=""/> </g> -<!-- 18 --> -<g id="node19" class="node" pointer-events="visible" data-name="18"> +<!-- 19 --> +<g id="node20" class="node" pointer-events="visible" data-name="19"> -<path fill="none" stroke="#59d856" stroke-width="2" d="M42,-180C42,-180 12,-180 12,-180 6,-180 0,-174 0,-168 0,-168 0,-156 0,-156 0,-150 6,-144 12,-144 12,-144 42,-144 42,-144 48,-144 54,-150 54,-156 54,-156 54,-168 54,-168 54,-174 48,-180 42,-180" style=""/> -<text text-anchor="middle" x="27" y="-159" font-family="sans" font-size="10.00" style="">quast</text> +<path fill="none" stroke="#56d8a9" stroke-width="2" d="M42,-324C42,-324 12,-324 12,-324 6,-324 0,-318 0,-312 0,-312 0,-300 0,-300 0,-294 6,-288 12,-288 12,-288 42,-288 42,-288 48,-288 54,-294 54,-300 54,-300 54,-312 54,-312 54,-318 48,-324 42,-324" style=""/> +<text text-anchor="middle" x="27" y="-303" font-family="sans" font-size="10.00" style="">quast</text> </g> -<!-- 1->18 --> -<g id="edge28" class="edge" data-name="1->18"> +<!-- 1->19 --> +<g id="edge32" class="edge" data-name="1->19"> -<path fill="none" stroke="grey" stroke-width="2" d="M351.07,-369.1C257.54,-352.23 53.7,-309.51 17,-252 5.91,-234.62 8.99,-211.36 14.54,-192.98" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="17.77,-194.35 17.75,-183.75 11.16,-192.05 17.77,-194.35" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M247.87,-368.34C201.53,-359.28 129.06,-343.64 68,-324 67.59,-323.87 67.17,-323.73 66.76,-323.59" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="68,-320.32 57.4,-320.18 65.6,-326.9 68,-320.32" style=""/> </g> <!-- 2 --> <g id="node3" class="node" pointer-events="visible" data-name="2"> -<path fill="none" stroke="#d8bc56" stroke-width="2" d="M210.64,-468C210.64,-468 147.36,-468 147.36,-468 141.36,-468 135.36,-462 135.36,-456 135.36,-456 135.36,-444 135.36,-444 135.36,-438 141.36,-432 147.36,-432 147.36,-432 210.64,-432 210.64,-432 216.64,-432 222.64,-438 222.64,-444 222.64,-444 222.64,-456 222.64,-456 222.64,-462 216.64,-468 210.64,-468" style=""/> -<text text-anchor="middle" x="179" y="-447" font-family="sans" font-size="10.00" style="">pigz_gfa_to_fasta</text> +<path fill="none" stroke="#d8bc56" stroke-width="2" d="M200.64,-468C200.64,-468 137.36,-468 137.36,-468 131.36,-468 125.36,-462 125.36,-456 125.36,-456 125.36,-444 125.36,-444 125.36,-438 131.36,-432 137.36,-432 137.36,-432 200.64,-432 200.64,-432 206.64,-432 212.64,-438 212.64,-444 212.64,-444 212.64,-456 212.64,-456 212.64,-462 206.64,-468 200.64,-468" style=""/> +<text 
text-anchor="middle" x="169" y="-447" font-family="sans" font-size="10.00" style="">pigz_gfa_to_fasta</text> </g> <!-- 2->1 --> -<g id="edge5" class="edge" data-name="2->1"> +<g id="edge4" class="edge" data-name="2->1"> -<path fill="none" stroke="grey" stroke-width="2" d="M223.58,-434.75C256.81,-424.13 302.52,-409.52 338.8,-397.92" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="339.87,-401.26 348.33,-394.88 337.74,-394.59 339.87,-401.26" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M201.12,-431.15C216.52,-422.6 235.2,-412.22 251.82,-402.99" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="253.25,-406.2 260.29,-398.28 249.85,-400.08 253.25,-406.2" style=""/> </g> -<!-- 2->18 --> -<g id="edge30" class="edge" data-name="2->18"> +<!-- 2->19 --> +<g id="edge31" class="edge" data-name="2->19"> -<path fill="none" stroke="grey" stroke-width="2" d="M153.61,-431.13C112.39,-400.28 32.84,-332.53 6,-252 -0.46,-232.61 4.89,-210.21 11.8,-192.74" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="14.92,-194.33 15.75,-183.76 8.52,-191.51 14.92,-194.33" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M151.02,-431.02C126.48,-406.48 82.3,-362.3 53.84,-333.84" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="56.52,-331.58 46.98,-326.98 51.58,-336.52 56.52,-331.58" style=""/> </g> <!-- 3 --> <g id="node4" class="node" pointer-events="visible" data-name="3"> -<path fill="none" stroke="#d8a456" stroke-width="2" d="M194,-540C194,-540 164,-540 164,-540 158,-540 152,-534 152,-528 152,-528 152,-516 152,-516 152,-510 158,-504 164,-504 164,-504 194,-504 194,-504 200,-504 206,-510 206,-516 206,-516 206,-528 206,-528 206,-534 200,-540 194,-540" style=""/> -<text text-anchor="middle" x="179" y="-519" font-family="sans" font-size="10.00" style="">hifiasm</text> +<path fill="none" stroke="#568ad8" stroke-width="2" d="M184,-540C184,-540 154,-540 154,-540 148,-540 142,-534 142,-528 142,-528 142,-516 142,-516 142,-510 148,-504 154,-504 154,-504 184,-504 184,-504 190,-504 196,-510 196,-516 196,-516 196,-528 196,-528 196,-534 190,-540 184,-540" style=""/> +<text text-anchor="middle" x="169" y="-519" font-family="sans" font-size="10.00" style="">hifiasm</text> </g> <!-- 3->2 --> -<g id="edge6" class="edge" data-name="3->2"> +<g id="edge5" class="edge" data-name="3->2"> -<path fill="none" stroke="grey" stroke-width="2" d="M179,-503.34C179,-496.75 179,-489.08 179,-481.67" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="182.5,-481.93 179,-471.93 175.5,-481.93 182.5,-481.93" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M169,-503.34C169,-496.75 169,-489.08 169,-481.67" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="172.5,-481.93 169,-471.93 165.5,-481.93 172.5,-481.93" style=""/> </g> <!-- 4 --> <g id="node5" class="node" pointer-events="visible" data-name="4"> -<path fill="none" stroke="#56d8d8" stroke-width="2" d="M71.66,-252C71.66,-252 38.34,-252 38.34,-252 32.34,-252 26.34,-246 26.34,-240 26.34,-240 26.34,-228 26.34,-228 26.34,-222 32.34,-216 38.34,-216 38.34,-216 71.66,-216 71.66,-216 77.66,-216 83.66,-222 83.66,-228 83.66,-228 83.66,-240 83.66,-240 83.66,-246 77.66,-252 71.66,-252" style=""/> -<text text-anchor="middle" x="55" y="-231" font-family="sans" font-size="10.00" style="">scafolding</text> +<path fill="none" stroke="#56a2d8" stroke-width="2" d="M127.66,-108C127.66,-108 94.34,-108 94.34,-108 88.34,-108 82.34,-102 82.34,-96 
82.34,-96 82.34,-84 82.34,-84 82.34,-78 88.34,-72 94.34,-72 94.34,-72 127.66,-72 127.66,-72 133.66,-72 139.66,-78 139.66,-84 139.66,-84 139.66,-96 139.66,-96 139.66,-102 133.66,-108 127.66,-108" style=""/> +<text text-anchor="middle" x="111" y="-87" font-family="sans" font-size="10.00" style="">scafolding</text> </g> <!-- 4->0 --> -<g id="edge2" class="edge" data-name="4->0"> - -<path fill="none" stroke="grey" stroke-width="2" d="M70.29,-215.12C102.18,-177.83 175.99,-91.51 214.17,-46.86" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="216.82,-49.15 220.66,-39.28 211.5,-44.6 216.82,-49.15" style=""/> -</g> -<!-- 4->18 --> -<g id="edge29" class="edge" data-name="4->18"> +<g id="edge1" class="edge" data-name="4->0"> -<path fill="none" stroke="grey" stroke-width="2" d="M47.93,-215.34C45.2,-208.51 42,-200.5 38.93,-192.83" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="42.25,-191.7 35.28,-183.71 35.75,-194.3 42.25,-191.7" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M111,-71.34C111,-64.75 111,-57.08 111,-49.67" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="114.5,-49.93 111,-39.93 107.5,-49.93 114.5,-49.93" style=""/> </g> <!-- 5->4 --> -<g id="edge7" class="edge" data-name="5->4"> +<g id="edge6" class="edge" data-name="5->4"> -<path fill="none" stroke="grey" stroke-width="2" d="M180.24,-288.46C157.69,-279.24 128.92,-267.35 96.17,-253.31" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="97.77,-250.19 87.2,-249.46 95.01,-256.62 97.77,-250.19" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M254.26,-303.78C215.98,-300.29 160.21,-288.7 130,-252 99.84,-215.36 101.17,-157.14 105.59,-121.66" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="109.04,-122.23 107.01,-111.84 102.11,-121.23 109.04,-122.23" style=""/> </g> <!-- 11 --> <g id="node12" class="node" pointer-events="visible" data-name="11"> -<path fill="none" stroke="#d86e56" stroke-width="2" d="M264,-252C264,-252 234,-252 234,-252 228,-252 222,-246 222,-240 222,-240 222,-228 222,-228 222,-222 228,-216 234,-216 234,-216 264,-216 264,-216 270,-216 276,-222 276,-228 276,-228 276,-240 276,-240 276,-246 270,-252 264,-252" style=""/> -<text text-anchor="middle" x="249" y="-231" font-family="sans" font-size="10.00" style="">busco</text> +<path fill="none" stroke="#56b9d8" stroke-width="2" d="M440.77,-252C440.77,-252 401.23,-252 401.23,-252 395.23,-252 389.23,-246 389.23,-240 389.23,-240 389.23,-228 389.23,-228 389.23,-222 395.23,-216 401.23,-216 401.23,-216 440.77,-216 440.77,-216 446.77,-216 452.77,-222 452.77,-228 452.77,-228 452.77,-240 452.77,-240 452.77,-246 446.77,-252 440.77,-252" style=""/> +<text text-anchor="middle" x="421" y="-231" font-family="sans" font-size="10.00" style="">busco_hap1</text> </g> <!-- 5->11 --> -<g id="edge19" class="edge" data-name="5->11"> +<g id="edge20" class="edge" data-name="5->11"> -<path fill="none" stroke="grey" stroke-width="2" d="M228.07,-287.34C230.8,-280.51 234,-272.5 237.07,-264.83" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="240.25,-266.3 240.72,-255.71 233.75,-263.7 240.25,-266.3" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M327.12,-287.15C342.52,-278.6 361.2,-268.22 377.82,-258.99" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="379.25,-262.2 386.29,-254.28 375.85,-256.08 379.25,-262.2" style=""/> </g> -<!-- 13 --> -<g id="node14" class="node" pointer-events="visible" data-name="13"> 
+<!-- 12 --> +<g id="node13" class="node" pointer-events="visible" data-name="12"> -<path fill="none" stroke="#5673d8" stroke-width="2" d="M357.98,-252C357.98,-252 306.02,-252 306.02,-252 300.02,-252 294.02,-246 294.02,-240 294.02,-240 294.02,-228 294.02,-228 294.02,-222 300.02,-216 306.02,-216 306.02,-216 357.98,-216 357.98,-216 363.98,-216 369.98,-222 369.98,-228 369.98,-228 369.98,-240 369.98,-240 369.98,-246 363.98,-252 357.98,-252" style=""/> -<text text-anchor="middle" x="332" y="-231" font-family="sans" font-size="10.00" style="">find_telomeres</text> +<path fill="none" stroke="#d85656" stroke-width="2" d="M500.77,-180C500.77,-180 461.23,-180 461.23,-180 455.23,-180 449.23,-174 449.23,-168 449.23,-168 449.23,-156 449.23,-156 449.23,-150 455.23,-144 461.23,-144 461.23,-144 500.77,-144 500.77,-144 506.77,-144 512.77,-150 512.77,-156 512.77,-156 512.77,-168 512.77,-168 512.77,-174 506.77,-180 500.77,-180" style=""/> +<text text-anchor="middle" x="481" y="-159" font-family="sans" font-size="10.00" style="">busco_hap2</text> </g> -<!-- 5->13 --> -<g id="edge22" class="edge" data-name="5->13"> +<!-- 5->12 --> +<g id="edge21" class="edge" data-name="5->12"> -<path fill="none" stroke="grey" stroke-width="2" d="M249.3,-287.15C262.61,-278.76 278.69,-268.62 293.12,-259.52" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="294.6,-262.72 301.19,-254.43 290.87,-256.8 294.6,-262.72" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M335.85,-303.26C374.01,-299.36 429.81,-287.48 462,-252 476.27,-236.28 480.79,-212.54 481.87,-193.53" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="485.37,-193.8 482.07,-183.73 478.37,-193.66 485.37,-193.8" style=""/> </g> <!-- 14 --> <g id="node15" class="node" pointer-events="visible" data-name="14"> -<path fill="none" stroke="#56d88a" stroke-width="2" d="M218.75,-180C218.75,-180 169.25,-180 169.25,-180 163.25,-180 157.25,-174 157.25,-168 157.25,-168 157.25,-156 157.25,-156 157.25,-150 163.25,-144 169.25,-144 169.25,-144 218.75,-144 218.75,-144 224.75,-144 230.75,-150 230.75,-156 230.75,-156 230.75,-168 230.75,-168 230.75,-174 224.75,-180 218.75,-180" style=""/> -<text text-anchor="middle" x="194" y="-159" font-family="sans" font-size="10.00" style="">LTR_retriever</text> +<path fill="none" stroke="#d88556" stroke-width="2" d="M202.98,-252C202.98,-252 151.02,-252 151.02,-252 145.02,-252 139.02,-246 139.02,-240 139.02,-240 139.02,-228 139.02,-228 139.02,-222 145.02,-216 151.02,-216 151.02,-216 202.98,-216 202.98,-216 208.98,-216 214.98,-222 214.98,-228 214.98,-228 214.98,-240 214.98,-240 214.98,-246 208.98,-252 202.98,-252" style=""/> +<text text-anchor="middle" x="177" y="-231" font-family="sans" font-size="10.00" style="">find_telomeres</text> </g> <!-- 5->14 --> -<g id="edge23" class="edge" data-name="5->14"> +<g id="edge25" class="edge" data-name="5->14"> -<path fill="none" stroke="grey" stroke-width="2" d="M217.58,-287.02C213.1,-263.44 205.17,-221.73 199.75,-193.25" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="203.26,-192.98 197.96,-183.81 196.39,-194.29 203.26,-192.98" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M264.92,-287.15C250.63,-278.68 233.34,-268.42 217.89,-259.25" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="220.02,-256.45 209.63,-254.36 216.45,-262.47 220.02,-256.45" style=""/> </g> <!-- 15 --> <g id="node16" class="node" pointer-events="visible" data-name="15"> -<path fill="none" stroke="#568ad8" stroke-width="2" 
d="M153.76,-252C153.76,-252 114.24,-252 114.24,-252 108.24,-252 102.24,-246 102.24,-240 102.24,-240 102.24,-228 102.24,-228 102.24,-222 108.24,-216 114.24,-216 114.24,-216 153.76,-216 153.76,-216 159.76,-216 165.76,-222 165.76,-228 165.76,-228 165.76,-240 165.76,-240 165.76,-246 159.76,-252 153.76,-252" style=""/> -<text text-anchor="middle" x="134" y="-231" font-family="sans" font-size="10.00" style="">LTR_finder</text> +<path fill="none" stroke="#d8d356" stroke-width="2" d="M238.75,-180C238.75,-180 189.25,-180 189.25,-180 183.25,-180 177.25,-174 177.25,-168 177.25,-168 177.25,-156 177.25,-156 177.25,-150 183.25,-144 189.25,-144 189.25,-144 238.75,-144 238.75,-144 244.75,-144 250.75,-150 250.75,-156 250.75,-156 250.75,-168 250.75,-168 250.75,-174 244.75,-180 238.75,-180" style=""/> +<text text-anchor="middle" x="214" y="-159" font-family="sans" font-size="10.00" style="">LTR_retriever</text> </g> <!-- 5->15 --> -<g id="edge25" class="edge" data-name="5->15"> +<g id="edge26" class="edge" data-name="5->15"> + +<path fill="none" stroke="grey" stroke-width="2" d="M303.19,-287.34C310.67,-268.41 318.78,-238.11 306,-216 300.78,-206.97 282.2,-195.57 263.09,-185.66" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="264.77,-182.58 254.26,-181.22 261.63,-188.84 264.77,-182.58" style=""/> +</g> +<!-- 16 --> +<g id="node17" class="node" pointer-events="visible" data-name="16"> + +<path fill="none" stroke="#56d8c9" stroke-width="2" d="M284.76,-252C284.76,-252 245.24,-252 245.24,-252 239.24,-252 233.24,-246 233.24,-240 233.24,-240 233.24,-228 233.24,-228 233.24,-222 239.24,-216 245.24,-216 245.24,-216 284.76,-216 284.76,-216 290.76,-216 296.76,-222 296.76,-228 296.76,-228 296.76,-240 296.76,-240 296.76,-246 290.76,-252 284.76,-252" style=""/> +<text text-anchor="middle" x="265" y="-231" font-family="sans" font-size="10.00" style="">LTR_finder</text> +</g> +<!-- 5->16 --> +<g id="edge28" class="edge" data-name="5->16"> -<path fill="none" stroke="grey" stroke-width="2" d="M199.05,-287.34C189.13,-279.36 177.21,-269.77 166.3,-260.99" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="168.59,-258.34 158.61,-254.8 164.2,-263.8 168.59,-258.34" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M287.43,-287.34C284.5,-280.51 281.07,-272.5 277.79,-264.83" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="281.02,-263.5 273.86,-255.68 274.59,-266.25 281.02,-263.5" style=""/> </g> <!-- 6 --> <g id="node7" class="node" pointer-events="visible" data-name="6"> -<path fill="none" stroke="#56a2d8" stroke-width="2" d="M392.24,-108C392.24,-108 337.76,-108 337.76,-108 331.76,-108 325.76,-102 325.76,-96 325.76,-96 325.76,-84 325.76,-84 325.76,-78 331.76,-72 337.76,-72 337.76,-72 392.24,-72 392.24,-72 398.24,-72 404.24,-78 404.24,-84 404.24,-84 404.24,-96 404.24,-96 404.24,-102 398.24,-108 392.24,-108" style=""/> -<text text-anchor="middle" x="365" y="-87" font-family="sans" font-size="10.00" style="">generate_report</text> +<path fill="none" stroke="#c6d856" stroke-width="2" d="M448.24,-108C448.24,-108 393.76,-108 393.76,-108 387.76,-108 381.76,-102 381.76,-96 381.76,-96 381.76,-84 381.76,-84 381.76,-78 387.76,-72 393.76,-72 393.76,-72 448.24,-72 448.24,-72 454.24,-72 460.24,-78 460.24,-84 460.24,-84 460.24,-96 460.24,-96 460.24,-102 454.24,-108 448.24,-108" style=""/> +<text text-anchor="middle" x="421" y="-87" font-family="sans" font-size="10.00" style="">generate_report</text> </g> <!-- 6->0 --> -<g id="edge3" class="edge" data-name="6->0"> +<g 
id="edge2" class="edge" data-name="6->0"> -<path fill="none" stroke="grey" stroke-width="2" d="M332.62,-71.15C315.62,-61.78 294.67,-50.23 276.8,-40.39" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="278.68,-37.43 268.23,-35.66 275.3,-43.56 278.68,-37.43" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M380.79,-79.92C321.39,-66.51 210.82,-41.54 151.35,-28.11" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="152.25,-24.73 141.72,-25.94 150.71,-31.55 152.25,-24.73" style=""/> </g> <!-- 7 --> <g id="node8" class="node" pointer-events="visible" data-name="7"> -<path fill="none" stroke="#a7d856" stroke-width="2" d="M468.43,-180C468.43,-180 421.57,-180 421.57,-180 415.57,-180 409.57,-174 409.57,-168 409.57,-168 409.57,-156 409.57,-156 409.57,-150 415.57,-144 421.57,-144 421.57,-144 468.43,-144 468.43,-144 474.43,-144 480.43,-150 480.43,-156 480.43,-156 480.43,-168 480.43,-168 480.43,-174 474.43,-180 468.43,-180" style=""/> -<text text-anchor="middle" x="445" y="-159" font-family="sans" font-size="10.00" style="">genomescope</text> +<path fill="none" stroke="#d8a456" stroke-width="2" d="M589.43,-180C589.43,-180 542.57,-180 542.57,-180 536.57,-180 530.57,-174 530.57,-168 530.57,-168 530.57,-156 530.57,-156 530.57,-150 536.57,-144 542.57,-144 542.57,-144 589.43,-144 589.43,-144 595.43,-144 601.43,-150 601.43,-156 601.43,-156 601.43,-168 601.43,-168 601.43,-174 595.43,-180 589.43,-180" style=""/> +<text text-anchor="middle" x="566" y="-159" font-family="sans" font-size="10.00" style="">genomescope</text> </g> <!-- 7->6 --> -<g id="edge12" class="edge" data-name="7->6"> +<g id="edge13" class="edge" data-name="7->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M424.81,-143.34C415.79,-135.44 404.96,-125.96 395.01,-117.26" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="397.57,-114.85 387.74,-110.9 392.96,-120.11 397.57,-114.85" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M529.78,-143.52C511.64,-134.76 489.41,-124.02 469.78,-114.55" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="471.52,-111.5 460.99,-110.31 468.48,-117.81 471.52,-111.5" style=""/> </g> <!-- 8 --> <g id="node9" class="node" pointer-events="visible" data-name="8"> -<path fill="none" stroke="#56c1d8" stroke-width="2" d="M468,-252C468,-252 438,-252 438,-252 432,-252 426,-246 426,-240 426,-240 426,-228 426,-228 426,-222 432,-216 438,-216 438,-216 468,-216 468,-216 474,-216 480,-222 480,-228 480,-228 480,-240 480,-240 480,-246 474,-252 468,-252" style=""/> -<text text-anchor="middle" x="453" y="-231" font-family="sans" font-size="10.00" style="">jellyfish</text> +<path fill="none" stroke="#78d856" stroke-width="2" d="M561,-252C561,-252 531,-252 531,-252 525,-252 519,-246 519,-240 519,-240 519,-228 519,-228 519,-222 525,-216 531,-216 531,-216 561,-216 561,-216 567,-216 573,-222 573,-228 573,-228 573,-240 573,-240 573,-246 567,-252 561,-252" style=""/> +<text text-anchor="middle" x="546" y="-231" font-family="sans" font-size="10.00" style="">jellyfish</text> </g> <!-- 8->7 --> -<g id="edge17" class="edge" data-name="8->7"> +<g id="edge18" class="edge" data-name="8->7"> -<path fill="none" stroke="grey" stroke-width="2" d="M450.98,-215.34C450.23,-208.75 449.35,-201.08 448.5,-193.67" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="452,-193.45 447.39,-183.91 445.05,-194.24 452,-193.45" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M551.05,-215.34C552.95,-208.67 
555.18,-200.89 557.32,-193.39" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="560.67,-194.39 560.05,-183.81 553.94,-192.47 560.67,-194.39" style=""/> </g> -<!-- 8->12 --> -<g id="edge21" class="edge" data-name="8->12"> +<!-- 8->13 --> +<g id="edge23" class="edge" data-name="8->13"> -<path fill="none" stroke="grey" stroke-width="2" d="M430.79,-215.34C420.76,-207.36 408.71,-197.77 397.67,-188.99" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="399.88,-186.27 389.87,-182.79 395.52,-191.75 399.88,-186.27" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M571.49,-215.15C583.25,-206.93 597.4,-197.02 610.2,-188.06" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="611.89,-191.15 618.07,-182.55 607.87,-185.42 611.89,-191.15" style=""/> </g> <!-- 9 --> <g id="node10" class="node" pointer-events="visible" data-name="9"> -<path fill="none" stroke="#70d856" stroke-width="2" d="M647.75,-180C647.75,-180 548.25,-180 548.25,-180 542.25,-180 536.25,-174 536.25,-168 536.25,-168 536.25,-156 536.25,-156 536.25,-150 542.25,-144 548.25,-144 548.25,-144 647.75,-144 647.75,-144 653.75,-144 659.75,-150 659.75,-156 659.75,-156 659.75,-168 659.75,-168 659.75,-174 653.75,-180 647.75,-180" style=""/> -<text text-anchor="middle" x="598" y="-159" font-family="sans" font-size="10.00" style="">genometools_on_raw_data</text> +<path fill="none" stroke="#56d863" stroke-width="2" d="M380.75,-180C380.75,-180 281.25,-180 281.25,-180 275.25,-180 269.25,-174 269.25,-168 269.25,-168 269.25,-156 269.25,-156 269.25,-150 275.25,-144 281.25,-144 281.25,-144 380.75,-144 380.75,-144 386.75,-144 392.75,-150 392.75,-156 392.75,-156 392.75,-168 392.75,-168 392.75,-174 386.75,-180 380.75,-180" style=""/> +<text text-anchor="middle" x="331" y="-159" font-family="sans" font-size="10.00" style="">genometools_on_raw_data</text> </g> <!-- 9->6 --> -<g id="edge11" class="edge" data-name="9->6"> +<g id="edge16" class="edge" data-name="9->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M538.3,-143.06C500.91,-131.83 453.19,-117.49 417.29,-106.71" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="418.6,-103.45 408.02,-103.92 416.59,-110.15 418.6,-103.45" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M353.71,-143.34C364.08,-135.27 376.55,-125.57 387.93,-116.72" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="389.85,-119.66 395.59,-110.76 385.55,-114.14 389.85,-119.66" style=""/> </g> <!-- 10->6 --> -<g id="edge13" class="edge" data-name="10->6"> +<g id="edge8" class="edge" data-name="10->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M502.6,-287.1C509.7,-255.14 519.53,-187.56 489,-144 472.54,-120.52 443.46,-107.38 417.59,-100.05" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="418.71,-96.73 408.15,-97.65 416.98,-103.51 418.71,-96.73" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M696.25,-287.21C704.71,-254.52 717.07,-184.55 682,-144 655.39,-113.24 542.11,-99.73 473.84,-94.29" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="474.49,-90.82 464.25,-93.56 473.96,-97.8 474.49,-90.82" style=""/> </g> <!-- 11->6 --> -<g id="edge9" class="edge" data-name="11->6"> +<g id="edge15" class="edge" data-name="11->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M263.28,-215.22C277.76,-197.21 300.84,-168.59 321,-144 327.82,-135.68 335.29,-126.65 342.1,-118.45" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" 
points="344.64,-120.87 348.34,-110.95 339.25,-116.4 344.64,-120.87" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M421,-215.02C421,-191.54 421,-150.11 421,-121.64" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="424.5,-121.87 421,-111.87 417.5,-121.87 424.5,-121.87" style=""/> +</g> +<!-- 11->12 --> +<g id="edge22" class="edge" data-name="11->12"> + +<path fill="none" stroke="grey" stroke-width="2" d="M436.14,-215.34C442.56,-207.85 450.19,-198.95 457.32,-190.63" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="459.84,-193.07 463.69,-183.2 454.52,-188.51 459.84,-193.07" style=""/> </g> <!-- 12->6 --> -<g id="edge16" class="edge" data-name="12->6"> +<g id="edge11" class="edge" data-name="12->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M365,-143.34C365,-136.75 365,-129.08 365,-121.67" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="368.5,-121.93 365,-111.93 361.5,-121.93 368.5,-121.93" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M465.86,-143.34C459.44,-135.85 451.81,-126.95 444.68,-118.63" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="447.48,-116.51 438.31,-111.2 442.16,-121.07 447.48,-116.51" style=""/> </g> <!-- 13->6 --> -<g id="edge15" class="edge" data-name="13->6"> +<g id="edge10" class="edge" data-name="13->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M328.04,-215.16C324.76,-196.86 321.68,-167.79 329,-144 331.68,-135.29 336.33,-126.72 341.42,-119.1" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="344.16,-121.29 347.23,-111.15 338.51,-117.16 344.16,-121.29" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M618.29,-147.35C615.52,-146.15 612.72,-145.02 610,-144 564.82,-127.13 511.84,-112.72 473.49,-103.2" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="474.56,-99.86 464.02,-100.88 472.9,-106.66 474.56,-99.86" style=""/> </g> <!-- 14->6 --> -<g id="edge14" class="edge" data-name="14->6"> +<g id="edge12" class="edge" data-name="14->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M231.53,-145.64C255.38,-135.88 286.59,-123.1 312.83,-112.36" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="314.15,-115.6 322.08,-108.57 311.5,-109.12 314.15,-115.6" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M168.42,-215.15C160.36,-195.49 151.75,-164.01 168,-144 192.73,-113.55 301.7,-99.97 368.3,-94.42" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="368.31,-97.93 378,-93.65 367.75,-90.95 368.31,-97.93" style=""/> </g> -<!-- 15->14 --> -<g id="edge24" class="edge" data-name="15->14"> +<!-- 15->6 --> +<g id="edge9" class="edge" data-name="15->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M149.14,-215.34C155.56,-207.85 163.19,-198.95 170.32,-190.63" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="172.84,-193.07 176.69,-183.2 167.52,-188.51 172.84,-193.07" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M251.62,-146.97C254.45,-145.96 257.27,-144.95 260,-144 296.18,-131.37 337.2,-117.87 368.79,-107.66" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="369.51,-111.11 377.95,-104.71 367.36,-104.44 369.51,-111.11" style=""/> </g> -<!-- 16->6 --> -<g id="edge10" class="edge" data-name="16->6"> +<!-- 16->15 --> +<g id="edge27" class="edge" data-name="16->15"> -<path fill="none" stroke="grey" stroke-width="2" d="M740.31,-287.13C734.21,-253.34 716.04,-180.23 
669,-144 630.53,-114.37 494.13,-100.13 417.76,-94.34" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="418.39,-90.88 408.16,-93.64 417.88,-97.86 418.39,-90.88" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M252.13,-215.34C246.8,-208.01 240.47,-199.34 234.53,-191.18" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="237.54,-189.37 228.83,-183.35 231.89,-193.49 237.54,-189.37" style=""/> </g> -<!-- 17 --> -<g id="node18" class="node" pointer-events="visible" data-name="17"> +<!-- 17->6 --> +<g id="edge17" class="edge" data-name="17->6"> + +<path fill="none" stroke="grey" stroke-width="2" d="M793.12,-287.08C782.89,-253.22 755.86,-179.98 705,-144 668.57,-118.23 545.59,-102.68 473.97,-95.59" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="474.53,-92.13 464.24,-94.66 473.86,-99.1 474.53,-92.13" style=""/> +</g> +<!-- 18 --> +<g id="node19" class="node" pointer-events="visible" data-name="18"> -<path fill="none" stroke="#56d873" stroke-width="2" d="M758,-396C758,-396 728,-396 728,-396 722,-396 716,-390 716,-384 716,-384 716,-372 716,-372 716,-366 722,-360 728,-360 728,-360 758,-360 758,-360 764,-360 770,-366 770,-372 770,-372 770,-384 770,-384 770,-390 764,-396 758,-396" style=""/> -<text text-anchor="middle" x="743" y="-375" font-family="sans" font-size="10.00" style="">meryl</text> +<path fill="none" stroke="#d86e56" stroke-width="2" d="M813,-396C813,-396 783,-396 783,-396 777,-396 771,-390 771,-384 771,-384 771,-372 771,-372 771,-366 777,-360 783,-360 783,-360 813,-360 813,-360 819,-360 825,-366 825,-372 825,-372 825,-384 825,-384 825,-390 819,-396 813,-396" style=""/> +<text text-anchor="middle" x="798" y="-375" font-family="sans" font-size="10.00" style="">meryl</text> </g> -<!-- 17->16 --> -<g id="edge27" class="edge" data-name="17->16"> +<!-- 18->17 --> +<g id="edge30" class="edge" data-name="18->17"> -<path fill="none" stroke="grey" stroke-width="2" d="M743,-359.34C743,-352.75 743,-345.08 743,-337.67" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="746.5,-337.93 743,-327.93 739.5,-337.93 746.5,-337.93" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M798,-359.34C798,-352.75 798,-345.08 798,-337.67" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="801.5,-337.93 798,-327.93 794.5,-337.93 801.5,-337.93" style=""/> </g> -<!-- 18->0 --> -<g id="edge4" class="edge" data-name="18->0"> +<!-- 19->6 --> +<g id="edge14" class="edge" data-name="19->6"> -<path fill="none" stroke="grey" stroke-width="2" d="M30.9,-143.01C36.17,-122.91 47.49,-90.75 69,-72 105.13,-40.51 159.91,-27.72 197.24,-22.53" style=""/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="197.63,-26.01 207.13,-21.31 196.78,-19.06 197.63,-26.01" style=""/> +<path fill="none" stroke="grey" stroke-width="2" d="M35.75,-287.07C53.29,-253.18 96.17,-179.9 155,-144 220.87,-103.81 311.4,-93.35 368.03,-91.04" style=""/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="368.06,-94.54 377.94,-90.72 367.83,-87.54 368.06,-94.54" style=""/> </g> </g> </svg> \ No newline at end of file diff --git a/doc/software_list.md b/doc/software_list.md index 5414911..d032740 100644 --- a/doc/software_list.md +++ b/doc/software_list.md @@ -5,7 +5,7 @@ Images are automatically pulled by Snakemake on first run and stored in the proj ## 01. 
 ## 01. Assembly

 Assembly:
-- [hifiasm](https://github.com/chhylp123/hifiasm) 0.19.6
+- [hifiasm](https://github.com/chhylp123/hifiasm) 0.24.0-r703
 - [YAK](https://github.com/lh3/yak) 0.1

 Haplotigs and Overlaps Purging:

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 51c0986..50c7b6b 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -46,11 +46,12 @@ rule hifiasm:
     benchmark:
         os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hifiasm_benchmark.txt")
     threads: 20
+    priority: 2
     resources:
         mem_mb=250000,
         time="80:00:00"
     container:
-        f"{container_registry}/hifiasm:0.19.6"
+        f"{container_registry}/hifiasm-yak:latest"
     shell:
         """
        ./workflow/scripts/hifiasm_call.sh {params.mode} {params.purge_force} {threads} {input.reads} {params.run_1} {params.run_2} {params.prefix} &&

diff --git a/workflow/scripts/hifiasm_call.sh b/workflow/scripts/hifiasm_call.sh
index 03a9ff4..42686c0 100755
--- a/workflow/scripts/hifiasm_call.sh
+++ b/workflow/scripts/hifiasm_call.sh
@@ -42,11 +42,11 @@ case "$MODE" in
     trio)
         echo "Asm4pg -> Hifiasm called in trio mode..."
         echo "Asm4pg -> Generating yak file for parent 1 ($RUN_1)"
-        yak count -k31 -b37 -t16 -o ${PREFIX}/yak/parent1.yak ${RUN_1}
+        yak count -k31 -b37 -t16 -o ${PREFIX}_parent1.yak ${RUN_1}
         echo "Asm4pg -> Generating yak file for parent 1 ($RUN_2)"
-        yak count -k31 -b37 -t16 -o ${PREFIX}/yak/parent2.yak ${RUN_2}
+        yak count -k31 -b37 -t16 -o ${PREFIX}_parent2.yak ${RUN_2}
         echo "Asm4pg -> Running hifiasm in trio mode..."
-        hifiasm -o ${PREFIX} -t ${THREADS} -1 ${PREFIX}/yak/parent1.yak -2 ${PREFIX}/yak/parent2.yak ${INPUT}
+        hifiasm -o ${PREFIX} -t ${THREADS} -1 ${PREFIX}_parent1.yak -2 ${PREFIX}_parent2.yak ${INPUT}
         echo "Asm4pg -> Renaming hifiasm output files"
         mv ${PREFIX}.dip.hap1.p_ctg.gfa ${PREFIX}.bp.hap1.p_ctg.gfa
         mv ${PREFIX}.dip.hap2.p_ctg.gfa ${PREFIX}.bp.hap2.p_ctg.gfa
--
GitLab
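PATCH 171 does three related things: it documents and installs hifiasm 0.24.0-r703, switches the rule to an image that bundles yak alongside hifiasm, and writes the parental .yak databases next to the assembly prefix instead of under a yak/ subdirectory. The resulting trio invocation issued by hifiasm_call.sh boils down to the following sketch (prefix and read files are illustrative):

    yak count -k31 -b37 -t16 -o sample_parent1.yak parent1_reads.fasta.gz
    yak count -k31 -b37 -t16 -o sample_parent2.yak parent2_reads.fasta.gz
    hifiasm -o sample -t 20 -1 sample_parent1.yak -2 sample_parent2.yak child_hifi.fasta.gz

One caveat: the hifiasm-yak:latest tag floats, so unlike the previous hifiasm:0.19.6 pin, future pulls may silently pick up a newer assembler.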
Refer to the documentation for all options samples: - run2: + run1: fasta_gz: small_example.fasta.gz - mode: default - run_purge_dups: True - assembly_purge_force: 2 - busco_lineage: eudicots_odb10 - reference_genome: small_ref.fasta.gz - run_ragtag: True -# run3: -# fasta_gz: small_example.fasta.gz -# mode: hi-c -# r1: small_example.fasta.gz -# r2: small_example.fasta.gz -# run4: -# fasta_gz: small_example.fasta.gz -# mode: hi-c -# r1: small_example.fasta.gz -# r2: small_example.fasta.gz -# run_ragtag: True -# run_purge_dups: True -# reference_genome: small_ref.fasta.gz -# run1: -# fasta_gz: small_example.fasta.gz -# mode: default -# run_purge_dups: False # You can modify the output path if you need to output_dir: "results/" +# Path to your output dir output_dir_pdw: /mnt/cbib/pangenoak_trials/GenomAsm4pg/ # This container registry will be used to download singularity images -container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg" \ No newline at end of file +container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg" + +# List of parameters : +# fasta_gz: Your reads (mandatory) +# mode: [default, hi-c, trio] (default: default) +# r1: The run1/parent1 read file +# r2: The run2/parent2 read file +# run_purge_dups: [True, False] (default: False) +# run_ragtag: [True, False] (default: False) +# reference_genome: +# busco_lineage: (default: eukaryota_odb10) +# ploidy: (default: 2) +# assembly_purge_force: [1-3] (default: 3) +# kmer_size: (default: 21) \ No newline at end of file -- GitLab From c156f07ce9989cbdebe70606b100441d86818a33 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-2.cluster> Date: Fri, 17 Jan 2025 11:44:36 +0100 Subject: [PATCH 173/178] Modifiy formating --- workflow/scripts/report.Rmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflow/scripts/report.Rmd b/workflow/scripts/report.Rmd index 21a0b36..d5cbb86 100755 --- a/workflow/scripts/report.Rmd +++ b/workflow/scripts/report.Rmd @@ -44,9 +44,11 @@ cat(readLines(snakemake@input[["genometools_on_raw_data"]]), sep = '\n') ## Global assembly QC QUAST Cumulative Plot +  QUAST Nx Plot +  ## QC on final assembly -- GitLab From 81d56ac81fe7fa6b37b0eb343524bd7363b926bc Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-2.cluster> Date: Fri, 17 Jan 2025 11:48:45 +0100 Subject: [PATCH 174/178] Add more inforamtion in comments --- .config/masterconfig.yaml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml index c235e6c..0255267 100644 --- a/.config/masterconfig.yaml +++ b/.config/masterconfig.yaml @@ -7,15 +7,17 @@ samples: run1: fasta_gz: small_example.fasta.gz +# FULL path to your output dir (mandatory) +output_dir_pdw: /home/lpiat/work/GenomAsm4pg/ -# You can modify the output path if you need to -output_dir: "results/" -# Path to your output dir -output_dir_pdw: /mnt/cbib/pangenoak_trials/GenomAsm4pg/ -# This container registry will be used to download singularity images +# ---- + container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg" +# You can modify the output path if you need to +output_dir: "results/" + # List of parameters : # fasta_gz: Your reads (mandatory) # mode: [default, hi-c, trio] (default: default) @@ -23,8 +25,10 @@ container_registry: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg" # r2: The run2/parent2 read file # run_purge_dups: [True, False] (default: False) # run_ragtag: [True, False] (default: False) -# reference_genome: +# reference_genome: 
Assembled fasta.gz genome
 # busco_lineage: (default: eukaryota_odb10)
+
+# Advanced parameters (we advise keeping default values)
 # ploidy: (default: 2)
 # assembly_purge_force: [1-3] (default: 3)
 # kmer_size: (default: 21)
\ No newline at end of file
-- GitLab

From f79548ab849ec6637919aeb9e32946378e7b0811 Mon Sep 17 00:00:00 2001
From: Lucien Piat <lpiat@bb8-2.cluster>
Date: Fri, 17 Jan 2025 12:05:31 +0100
Subject: [PATCH 175/178] Remove typos in documentation

---
 README.md            | 22 ++++++++++------------
 doc/documentation.md |  2 +-
 doc/going_further.md |  6 +++---
 doc/known_errors.md  | 12 ++++++------
 4 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index d88229a..888193e 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,5 @@
-# TODO, ADAPT THIS OBSOLOETE README
-
 # <A HREF="https://forgemia.inra.fr/asm4pg/GenomAsm4pg"> asm4pg </A>
-An automatic and reproducible genome assembly workflow for pangenomic applications using PacBio HiFi data.
+This is an automatic and reproducible genome assembly workflow for pangenomic applications using PacBio HiFi data.
 
 This workflow uses [Snakemake](https://snakemake.readthedocs.io/en/stable/) to quickly assemble genomes with a HTML report summarizing obtained assembly stats.
 
@@ -22,7 +20,7 @@ This workflow uses [Snakemake](https://snakemake.readthedocs.io/en/stable/) to q
 └── masterconfig.yaml
 ```
 
-## Requirement
+## Requirements
 Miniforge (Snakemake), Singularity/Apptainer
 ## How to Use
 ### 1. Set up
 Clone the Git repository
 ```bash
 git clone https://forgemia.inra.fr/asm4pg/GenomAsm4pg.git && cd GenomAsm4pg
 ```
-> All other tools will be ran in Singularity/Apptainer images automaticly downloaded by Snakemake. Total size of the images is ~5.5G
+> All other tools will be run in Singularity/Apptainer images automatically downloaded by Snakemake. Total size of the images is ~5.5G
 
 ### 2. Configure the pipeline
 - Edit the `masterconfig` file in the `.config/` directory with your sample information.
 
 #### <ins>A. On a HPC</ins>
 - Edit `job.sh` with path to the modules `Singularity/Apptainer`, `Miniforge`
-- Provide and environement with `Snakemake` and `snakemake-executor-plugin-slurmin` in `job.sh`, under `source activate wf_env`, you can create it like this :
+- Provide an environment with `Snakemake` and `snakemake-executor-plugin-slurm` in `job.sh`, under `source activate wf_env`, you can create it like this:
 ```bash
 conda create -n wf_env -c conda-forge -c bioconda snakemake=8.4.7 snakemake-executor-plugin-slurm
 ```
-> Use Miniforge with the conda-forge chanel, see why [here](https://science-ouverte.inrae.fr/fr/offre-service/fiches-pratiques-et-recommandations/quelles-alternatives-aux-fonctionnalites-payantes-danaconda) (french)
+> Use Miniforge with the conda-forge channel, see why [here](https://science-ouverte.inrae.fr/fr/offre-service/fiches-pratiques-et-recommandations/quelles-alternatives-aux-fonctionnalites-payantes-danaconda) (French)
 - Add the log directory for SLURM
 ```bash
 mkdir slurm_logs
 ```
 - Run the workflow :
 ```bash
 sbatch job.sh dry # Check for warnings
 sbatch job.sh run # Then
 ```
-> **Nb 1:** If the your account name cant be automaticly determined, add it in the `.config/snakemake/profiles/slurm/config.yaml` file.
+> **Nb 1:** If your account name can't be automatically determined, add it in the `.config/snakemake/profiles/slurm/config.yaml` file.
-#### <ins>B. 
Localy</ins>
-- Make sure you have Snakemake and Singularity/Apptainer instaled
+#### <ins>B. Locally</ins>
+- Make sure you have Snakemake and Singularity/Apptainer installed
 - Run the workflow :
 ```bash
 ./local_run dry # Check for warnings
 sbatch job.sh run # Then
 ```
 
 ## Using the full potential of the workflow :
-Asm4pg as many options, if you wish to modify the default values and now more about the workflow, please refer to the [documentation](doc/documentation.md)
+Asm4pg has many options. If you wish to modify the default values and know more about the workflow, please refer to the [documentation](doc/documentation.md)
 
 ## How to cite asm4pg?
 
@@ -72,5 +70,5 @@ We are currently writing a publication about asm4pg. Meanwhile, if you use the p
 The content of this repository is licensed under <A HREF="https://choosealicense.com/licenses/gpl-3.0/">(GNU GPLv3)</A>
 
 ## Contacts
-For any troubleshouting, issue or feature suggestion, please use the issue tab of this repository.
+For any troubleshooting, issue or feature suggestion, please use the issue tab of this repository.
 For any other question or if you want to help in developing asm4pg, please contact Ludovic Duvaux at ludovic.duvaux@inrae.fr
diff --git a/doc/documentation.md b/doc/documentation.md
index d56311f..6e8850b 100644
--- a/doc/documentation.md
+++ b/doc/documentation.md
@@ -18,4 +18,4 @@ You may encounter [these errors](doc/known_errors.md).
 
 ## Software
 
-Here is a list of [software used in the workflow](doc/software_list.md)
\ No newline at end of file
+Here is a list of [software used in the workflow](doc/software_list.md).
diff --git a/doc/going_further.md b/doc/going_further.md
index fd4edda..2f8e2ac 100644
--- a/doc/going_further.md
+++ b/doc/going_further.md
@@ -12,7 +12,7 @@ Usage: `job.sh/local_run.sh [dry|run|dag|rulegraph|unlock]`
 
 ## 02. Workflow Options
 
-Inside the `./.config/marsterconfig.yaml` file, you can add more options.
+Inside the `./.config/masterconfig.yaml` file, you can add more options.
 
 Here are all the options and their default values:
 - `fasta_gz`: Your reads (mandatory)
@@ -25,10 +25,10 @@ Here are all the options and their default values:
 - `run_ragtag`: [True, False] If set to true, the workflow will run RagTag and produce a scaffold of the assemblies (default: False)
 - `reference_genome`: The reference genome used for QUAST and RagTag scaffolding
 
-⚠️ Advanced options (use only if you have read the tools' documentation; we strongly advise keeping default values):
+⚠️ Advanced options (use only if you have read the tools' documentation; we strongly recommend keeping default values):
 
 - `assembly_purge_force`: [1-3] The purge level of Hifiasm `-l` parameter, full description [here](https://hifiasm.readthedocs.io/en/latest/parameter-reference.html) (default: 3)
-- `kmer_size`: The sizes of the kmers used for QC steps (default: 21)
+- `kmer_size`: The size of the kmers used for QC steps (default: 21)
 
 ## 03. Example Configurations
 
diff --git a/doc/known_errors.md b/doc/known_errors.md
index b648666..1f42a3d 100644
--- a/doc/known_errors.md
+++ b/doc/known_errors.md
@@ -2,21 +2,21 @@
 ## BUSCO Rule Failures
-During first run, multiple simultaneous BUSCO lineage downloads may cause job conflicts. Simply rerun the workflow after completion to resolve this.
+During the first run, multiple simultaneous BUSCO lineage downloads may cause job conflicts. 
Simply rerun the workflow after completion to resolve this issue. ## Snakemake Locked Directory -If workflow rerun fails after job cancellation: -1. Run `job.sh/local_run.sh unlock` -2. Then rerun the workflow normaly +If the workflow rerun fails after job cancellation: +1. Run `job.sh/local_run.sh unlock`. +2. Then rerun the workflow normally. ## HPC Problems The `job.sh` script is incompatible with HPCs that restrict job nesting. -For older SLURM versions, you may encounter: `srun: unrecognized option '--cpu-bind=q'`. This is a [known SLURM/Snakemake issue](https://github.com/snakemake/snakemake/issues/2071) requiring SLURM update. +For older SLURM versions, you may encounter: `srun: unrecognized option '--cpu-bind=q'`. This is a [known SLURM/Snakemake issue](https://github.com/snakemake/snakemake/issues/2071) that requires a SLURM update. -Temporary workaround for thoses issues include using `local_run.sh` with sbatch: +Temporary workarounds for these issues include using `local_run.sh` with sbatch: ```bash module load Singularity source activate wf_env -- GitLab From 3dfc393be42def8d9d8695ea92d05678ae87d43f Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-2.cluster> Date: Fri, 17 Jan 2025 13:21:40 +0100 Subject: [PATCH 176/178] add input conversion script to readme --- README.md | 6 ++++++ workflow/scripts/input_conversion.sh | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) mode change 100644 => 100755 workflow/scripts/input_conversion.sh diff --git a/README.md b/README.md index 888193e..0edd717 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,12 @@ sbatch job.sh run # Then ./local_run job.sh run # Then ``` +## Input Conversion +Currently, asm4pg requires `fasta.gz` files. To convert your `fastq` or `bam` files to this format, you can use the following tools: +```bash +./workflow/scripts/input_conversion.sh -i <input_file> -o <output_file> +``` + ## Using the full potential of the workflow : Asm4pg has many options. If you wish to modify the default values and know more about the workflow, please refer to the [documentation](doc/documentation.md) diff --git a/workflow/scripts/input_conversion.sh b/workflow/scripts/input_conversion.sh old mode 100644 new mode 100755 index 189d5f0..1dcd6ec --- a/workflow/scripts/input_conversion.sh +++ b/workflow/scripts/input_conversion.sh @@ -29,7 +29,7 @@ fi # Ensure necessary tools are installed if ! command -v samtools &> /dev/null || ! command -v seqtk &> /dev/null; then - echo "Error: 'samtools' and 'seqtk' are required but not installed." + echo "Error: 'samtools' and 'seqtk' are required but not installed/loaded." 
exit 1 fi -- GitLab From 5dde53ff45d84241362dbab7b19dbd748c4bec41 Mon Sep 17 00:00:00 2001 From: Lucien Piat <lpiat@bb8-2.cluster> Date: Mon, 3 Feb 2025 09:36:11 +0100 Subject: [PATCH 177/178] add benchmark for all tools --- workflow/Snakefile | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 50c7b6b..bd16739 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -44,7 +44,7 @@ rule hifiasm: raw_out_hap1 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap1.p_ctg.gfa"), raw_out_hap2 = os.path.join(output_dir, "{sample}_results", "01_raw_assembly","{sample}.bp.hap2.p_ctg.gfa") benchmark: - os.path.join(output_dir, "{sample}_results", "01_raw_assembly", "{sample}_hifiasm_benchmark.txt") + os.path.join(output_dir, "{sample}_results", "benchmark", "hifiasm_benchmark.txt") threads: 20 priority: 2 resources: @@ -73,6 +73,7 @@ rule pigz_gfa_to_fasta: resources: mem_mb=25000, time="80:00:00" + benchmark: os.path.join(output_dir, "{sample}_results", "benchmark", "pigz_hap{n}_benchmark.txt") container: f"{container_registry}/pigz" shell: @@ -93,6 +94,7 @@ rule haplotigs_handling: out_dir = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}"), purge_dups_option = get_purge_bool threads: 20 + benchmark: os.path.join(output_dir, "{sample}_results", "benchmark", "haplotigs_handling_hap{n}_benchmark.txt") resources: mem_mb=100000, time="80:00:00" @@ -112,6 +114,7 @@ rule unpigz_to_fasta: container: f"{container_registry}/pigz" threads: 4 + benchmark: os.path.join(output_dir, "{sample}_results", "benchmark", "unpigz_hap{n}_benchmark.txt") resources: mem_mb=25000, time="80:00:00" @@ -127,6 +130,8 @@ rule cutoffs_graph: graph = os.path.join(output_dir, "{sample}_results", "02_final_assembly","hap{n}","cutoffs_graph_hap{n}.png") params: out_dir = os.path.join(output_dir, "{sample}_results", "02_final_assembly", "hap{n}") + benchmark: + os.path.join(output_dir, "{sample}_results", "benchmark", "cutoffs_graph_hap{n}_benchmark.txt") resources: mem_mb=10000, time="80:00:00" @@ -145,6 +150,8 @@ rule genometools_on_raw_data: resources: mem_mb=100000, time="80:00:00" + benchmark: + os.path.join(output_dir, "{sample}_results", "benchmark", "genometools_raw_benchmark.txt") threads: 4 container: f"{container_registry}/genometools1.5.9" @@ -157,6 +164,8 @@ use rule genometools_on_raw_data as genometools_on_assembly with: rules.haplotigs_handling.output.hap output: os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "{sample}_hap{n}_genometools_stats.txt") + benchmark: + os.path.join(output_dir, "{sample}_results", "benchmark", "genometools_hap{n}_benchmark.txt") priority: 0 # BUSCO stats on assembly @@ -170,6 +179,8 @@ rule busco_hap1: lineage=get_busco_lin, sample="{sample}_hap1" threads: 20 + benchmark: + os.path.join(output_dir, "{sample}_results", "benchmark", "busco_hap1_benchmark.txt") resources: mem_mb=100000, time="80:00:00" @@ -189,6 +200,8 @@ use rule busco_hap1 as busco_hap2 with: other = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco", "busco_{sample}_hap1.txt") output: os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap2", "busco", "busco_{sample}_hap2.txt") + benchmark: + os.path.join(output_dir, "{sample}_results", "benchmark", "busco_hap2_benchmark.txt") params: prefix = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap1", "busco"), lineage=get_busco_lin, @@ 
-201,6 +214,8 @@ rule find_telomeres:
     output:
         os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "telomeres", "{sample}_hap{n}_telomeres.txt")
     threads: 4
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "find_telomeres_hap{n}_benchmark.txt")
     resources:
         mem_mb=40000,
         time="80:00:00"
@@ -219,6 +234,8 @@ rule jellyfish:
     params:
         km_size = get_kmer_size
     threads: 10
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "jellyfish_benchmark.txt")
     resources:
         mem_mb=80000,
         time="80:00:00"
@@ -246,6 +263,8 @@ rule genomescope:
     resources:
         mem_mb=40000,
         time="80:00:00"
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "genomescope_benchmark.txt")
     container:
         f"{container_registry}/genomescope2.0"
     shell:
@@ -262,6 +281,8 @@ rule kat:
         out_dir = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "hap{n}", "katplot", "{sample}_hap{n}"),
         km_size = get_kmer_size
     threads: 4
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "kat_hap{n}_benchmark.txt")
     resources:
         mem_mb=40000,
         time="80:00:00"
@@ -283,6 +304,8 @@ rule meryl:
     params:
         km_size = get_kmer_size
     threads: 20
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "meryl_benchmark.txt")
     resources:
         mem_mb=60000,
         time="80:00:00"
@@ -304,6 +327,8 @@ rule merqury:
         prefix = "{sample}_merqury",
         out_dir = os.path.join(output_dir, "{sample}_results", "04_assembly_qc", "merqury")
     threads: 20
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "merqury_benchmark.txt")
     resources:
         mem_mb=60000,
         time="80:00:00"
@@ -332,6 +357,8 @@ rule LTR_finder:
     resources:
         mem_mb=60000,
         time="80:00:00"
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "ltr_finder_hap{n}_benchmark.txt")
     container:
         f"{container_registry}/ltr_finder:latest"
     shell:
@@ -353,6 +380,8 @@ rule LTR_retriever:
     resources:
         mem_mb=250000,
         time="50:00:00"
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "ltr_retriever_hap{n}_benchmark.txt")
     container:
         f"{container_registry}/ltr_retriever:3.0.1"
     shell:
@@ -383,6 +412,8 @@ rule scafolding:
     resources:
         mem_mb = 80000,
         time="80:00:00"
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "scafolding_hap{n}_benchmark.txt")
     container:
         f"{container_registry}/ragtag:2.0.1"
     shell:
@@ -409,6 +440,8 @@ rule quast:
     container:
         f"{container_registry}/staphb/quast:5.2.0"
     threads: 20
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "quast_benchmark.txt")
     resources:
         mem_mb=250000,
         time="80:00:00"
@@ -464,7 +497,9 @@ rule generate_report:
     resources:
         mem_mb=10000,
         time="80:00:00"
+    benchmark:
+        os.path.join(output_dir, "{sample}_results", "benchmark", "generate_report_benchmark.txt")
     container:
         f"{container_registry}/rmarkdown4.0.3"
     script:
-        "./scripts/report.Rmd"
\ No newline at end of file
+        "./scripts/report.Rmd"
-- GitLab

From 19733e1c0e387cf418753b9fc43c47ae1654977e Mon Sep 17 00:00:00 2001
From: Lucien Piat <lpiat@bb8-2.cluster>
Date: Mon, 3 Feb 2025 09:37:27 +0100
Subject: [PATCH 178/178] Homogenize quast command between pan1c and asm4pg

---
 workflow/scripts/quast_call.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/scripts/quast_call.sh b/workflow/scripts/quast_call.sh
index 5da4467..aa11401 100755
--- a/workflow/scripts/quast_call.sh
+++ b/workflow/scripts/quast_call.sh
@@ -26,7 +26,7 @@ fi
 
 # Build the quast command
 echo "Asm4pg -> Building the QUAST command..."
-quast_cmd="python /quast-5.2.0/metaquast.py --threads 20 --large --no-check --no-snps --no-icarus --plots-format png " +quast_cmd="python /quast-5.2.0/metaquast.py --threads 20 --no-read-stats --large --no-snps --no-icarus --plots-format png " if [ "$REFERENCE_GENOME" != "None" ]; then echo " - Reference genome specified: $REFERENCE_GENOME" quast_cmd+="--reference $REFERENCE_GENOME " -- GitLab