diff --git a/README.md b/README.md index 33b4fbfd8e747dc4ea1dde4b078835c50155ac1b..ec0e7444668d34ffafc42e0831676459ee0bbc05 100644 --- a/README.md +++ b/README.md @@ -19,5 +19,27 @@ An example of the params.config and fastqScreen are available in the assets fold Example of a basic command line the launch the pipeline is (from the nextflow folder) : ```bash -sbatch -J nf-illumina_BHNKY7DRX2_1 -p wflowq -t 3-00 --mem 5GB --wrap="module load bioinfo/Nextflow-v21.04.1; cd /home/sbsuser/work/data/NovaSeq/230116_A00318_0372_BHNKY7DRX2_Lane1_1673933427_10x/nextflow; nextflow run /work/sbsuser/test/jules/VisualStudioSources/wf-illumina-nf/main.nf -profile prod -ansi-log false" -``` \ No newline at end of file +sbatch -J nf-illumina_BHNKY7DRX2_1 -p wflowq -t 3-00 --mem 5GB --wrap="module load bioinfo/Nextflow-v21.04.1; cd /home/sbsuser/work/data/NovaSeq/230116_A00318_0372_BHNKY7DRX2_Lane1_1673933427_10x/nextflow; nextflow run /work/sbsuser/test/jules/VisualStudioSources/wf-illumina-nf/main.nf -profile prod -ansi-log false -params-file ../params.yml" +``` + +The YAML parameter file must look like : +``` +inputdir: "/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq/211129_A00318_0259_AHNMTTDSX2_Lane1_1638345606_dna" +project: 'GwOAK_small' +is_multiplex: true +data_nature: "DNA" +pairedEnd: true +split_reads: true # // ???? +reference_genome: "/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Quercus_robur/genome/GCA_900291515.1/BWA/GCA_900291515.1_Q_robur_v1_genomic.fna" +addBankForConta: "" +run_name: "ContaComparison" +sequencer: "NovaSeq" +run_date: "2022" +machine_id: "NOVA" +fc_id: "HNMTTDSX2" +lane: "1" +demux_uniqueness: "1638345606" +``` + + +NB : for the moment, the case of multi-projects lane is not managed ! 
\ No newline at end of file diff --git a/assets/begin_template.txt b/assets/begin_template.txt new file mode 100644 index 0000000000000000000000000000000000000000..1dd968ddbd5944beffebd23aa3c68fbff7e06d2a --- /dev/null +++ b/assets/begin_template.txt @@ -0,0 +1,29 @@ +---------------------------------------------------------------------------------- +================================================== +------------------------------- get-nf workflow ---------------------------- + I L L U M I N A - N F P I P E L I N E + V$version +================================================== +---------------------------------------------------------------------------------- + +NextFlow Run Name : $wfRunName + +Demultiplexing is over, the analysis started at $dateStart. + +The analysis of the following sequencing is running : +- Project : $project +- Run : $run_name +- Data : $data_nature +- Sequencer : $sequencer +- FlowCell : $flowcell +- Lane : $lane +- Directory : $directory + + +The command used to launch the workflow was as follows : + + $commandLine + +--- +$name +$homePage diff --git a/assets/email_template.txt b/assets/email_template.txt deleted file mode 100644 index f7ef189e45abfdd03dbb3d2aa84fd2b725299798..0000000000000000000000000000000000000000 --- a/assets/email_template.txt +++ /dev/null @@ -1,35 +0,0 @@ ----------------------------------------------------- - GeT/template v${version} ----------------------------------------------------- - -Run Name: $runName - -<% if (success){ - out << "## GeT/template execution completed successfully! ##" -} else { - out << """#################################################### -## GeT/template execution completed unsuccessfully! ## -#################################################### -The exit status of the task that caused the workflow execution to fail was: $exitStatus. 
-The full error message was: - -${errorReport} -""" -} %> - - -The workflow was completed at $dateComplete (duration: $duration) - -The command used to launch the workflow was as follows: - - $commandLine - - - -Pipeline Configuration: ------------------------ -<% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> - --- -GeT/template -https://forgemia.inra.fr/get-nextflow-ngl-bi/template-nf diff --git a/assets/final_email_template.txt b/assets/final_email_template.txt new file mode 100644 index 0000000000000000000000000000000000000000..5221db87d4c952d718146f6461efb6d5fe850f97 --- /dev/null +++ b/assets/final_email_template.txt @@ -0,0 +1,43 @@ +---------------------------------------------------------------------------------- +================================================== +------------------------------- get-nf workflow ---------------------------- + I L L U M I N A - N F P I P E L I N E + V$version +================================================== +---------------------------------------------------------------------------------- + +NextFlow Run Name : $runName +Project : $project +Run : $run + +<% if (success){ + out << """## GeT-nextflow-NGL-Bi/wf-Illumina-nf execution completed successfully! ## + Check your analyzes on NGL-Bi : http://esitoul-prod.toulouse.inra.fr:9096/ +""" +} else { + out << """############################################### +## GeT-nextflow-NGL-Bi/wf-Illumina-nf execution completed unsuccessfully! ## +############################################### +The exit status of the task that caused the workflow execution to fail was: $exitStatus. 
+The full error message was: + +${errorReport} +""" +} %> + + +The workflow was completed at $dateComplete (duration: $duration) + +The command used to launch the workflow was as follows: + + $commandLine + + + +Pipeline Configuration: +----------------------- +<% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> + +--- +$name +$homePage diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 528c27ced03f492edc020287268965ec58831569..4338c5bbb6a2dff99cf9f1825839e43ddf247576 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -16,10 +16,9 @@ show_analysis_time: False thousandsSep_format: " " ## Sample name formatting -extra_fn_clean_trim: +extra_fn_clean_exts: - "_filtered" - "_unmerged" - - "_unmerged_stats" - "_flagstat" - "_subset" - "_screen" diff --git a/assets/params.config_example b/assets/params.config_example index 0bd525efeaddf04e134582ed908a180048c59615..b197245d0d7f314a52dd0c344eabb75e935cfa0b 100644 --- a/assets/params.config_example +++ b/assets/params.config_example @@ -3,17 +3,17 @@ params { samplesheet = inputdir+'/SampleSheet.csv' project = 'MAGICs' data=inputdir+'/'+project - isMultiplex = true - dataNature = 'DNA' + is_multiplex = true + data_nature = 'DNA' //pairedEnd = true - splitReads = true - referenceGenome = '' + split_reads = true + reference_genome = '' addBankForConta = '' - runName='Test_10X' + run_name='Test_10X' sequencer='NovaSeq' run_date='230116' - machineID='NOVA' - fcID='BHNKY7DRX2' + machine_id='NOVA' + fc_id='BHNKY7DRX2' lane='1' - demuxUniqueness='1673933427' + demux_uniqueness='1673933427' } \ No newline at end of file diff --git a/assets/params.yml_example b/assets/params.yml_example new file mode 100644 index 0000000000000000000000000000000000000000..ab6728c6f0e9bbb24535213159ad8fac0f7fc4f1 --- /dev/null +++ b/assets/params.yml_example @@ -0,0 +1,20 @@ + +inputdir: "/home/sbsuser/work/data/NovaSeq/230116_A00318_0372_BHNKY7DRX2_Lane1_1673933427_10x" +project: "MAGICs" 
+is_multiplex: true +data_nature: "DNA" +split_reads: true +species: "Mus musculus" +reference_genome: "" +reference_transcriptome: "" +run_name: "Test_10X" +description: "" +sequencer: "NovaSeq" +machine_id: "NOVA" +run_date: "230116" +fc_id: "BHNKY7DRX2" +fc_type: "Flowcell Standard - Lane 1" +lane: "1" +demux_uniqueness: "1673933427" +min_overlap: 100 +max_overlap: 230 \ No newline at end of file diff --git a/bin/DTM/Readme.md b/bin/DTM/Readme.md new file mode 100644 index 0000000000000000000000000000000000000000..fa2ad35610f20b9ae530b03e1513241121c8df25 --- /dev/null +++ b/bin/DTM/Readme.md @@ -0,0 +1,55 @@ +# DTM scripts +In this folder are some scripts to perform methodological and technological developments at GeT-PlaGe. +For the moment, the main.nf script is used to perform basic analyses. Then, specific analyses on coverage can be done using the `make_bedgraph.sh` and `circlize_v2.R` scripts. + +## Coverage comparison +Comparative coverage analysis by sample can be performed using `make_bedgraph.sh` and `circlize_v2.R`. +### 1/ Using **make_bedgraph.sh** +This bash script does: +1. renaming chromosomes/scaffolds using chrom.names file +2. BAM indexing +3. bedgraph file generating +4. removing of unwanted chromosomes/scaffolds + +This script takes three mandatory non-flagged inputs: +- path to BAM files folder +- path to chrom_names file +- pattern of unwanted chromosomes/scaffolds (can be a void string) + +Example of command line to generate chrom_names file: +```bash +grep "^>" genome.fa | cut -d' ' -f1,8 | sed 's/>//' - | sed 's/,//' - | tr -s ' ' '\t' > chrom_names +``` +*NB: fields to keep in the cut command must be adapted for each genome.fa file. 
+The second column of the chrom_names file can be written by hand if needed.* +Example of chrom_names file content: +```bash +GK000076.1 1 +GK000077.1 2 +GK000078.1 3 +GK000079.1 4 +GK000080.1 5 +GK000081.1 6 +GK000082.1 7 +GK000083.1 8 +GK000084.1 9 +GK000085.1 10 +``` + +Example of sbatch command of `make_bedgraph.sh`: +```bash +sbatch -J bedgraph --array=1-6 make_bedgraph.sh ../samtools ../chrom_names "JANXI\|CM" +``` +*NB : `make_bedgraph.sh` is an array slurm script, it runs one time per BAM file. So, the `--array=` argument must contain the number of BAM files to analyze.* + +### 2/ Using **circlize_v2.R** +This R script makes one circos plot for every input data. +It takes two mandatory non-flagged arguments : +- chunk_size +- list of bedgraph files (each file must be comma-space separated: `, `) + +*NB : use `ls -m *.bedgraph` to generate the well structured list of bedgraph files* +Example of sbatch command of `circlize_v2.R` : +```bash +sbatch -p wflowq -t 12:00:00 --mem-per-cpu=124GB -J circosplot --wrap="module load system/R-4.2.1_Miniconda3; Rscript circlize_v2.R 100000 'zeros_scaled_filtered_bacterie-100ng-1_S20_L004_R1_001_unmerged.bedgraph, zeros_scaled_filtered_bacterie-100ng-2_S21_L004_R1_001_unmerged.bedgraph'" +``` diff --git a/bin/extractInfoForDemuxStats.pl b/bin/extractInfoForDemuxStats.pl index 71218fc3d7e35bd8c5f9729df4c4f8c25ada85f5..8e879af83d1901c975a878c53a38b2eba492d626 100755 --- a/bin/extractInfoForDemuxStats.pl +++ b/bin/extractInfoForDemuxStats.pl @@ -113,6 +113,7 @@ foreach my $k (keys(%sample_info)) { $content.="$k\t$sample_info{$k}\n"; } +$projectName = $projectName eq "" ? 
'noName' : $projectName; my $file2write = "$projectName.indexNumber"; open(my $fh, '>', $file2write) or exit 1; diff --git a/conf/base.config b/conf/base.config index 78238b18a04674914aa1596902df1d58cbd98590..85910c282e6090115c276d443effc2a2112099fb 100644 --- a/conf/base.config +++ b/conf/base.config @@ -5,31 +5,61 @@ System.out.println "Chargement des paramètres de base" // Fixed params params { // EMPTY INITIALISATION OF INPUT PARAMS - referenceGenome = '' - inputdir = "" + // General params outdir = "./" // base output directory for all analysis -} + inputdir = "" + project = "" + sequencer = "" + machine_id = "" + fc_id = "" + fc_type = "" + lane = "" + demux_uniqueness = "" + + data_nature = "" + species = "" + is_multiplex = false + + run_name = "" + run_date = "" + description = "" + split_reads = false + + // DNA / RNA params + reference_genome = "" + reference_transcriptome = "" -import java.text.SimpleDateFormat -SimpleDateFormat uniqueness_format = new SimpleDateFormat("yyyMMddHHmmss") + // Amplicon / 16S params + min_overlap = "" + max_overlap = "" + + // 10X params -System.out.println "Lecture du fichier de configuration du run : $launchDir/../params.config" -includeConfig "$launchDir/../params.config" + + // MethylSeq params + puc19 = "" + lambda = "" +} + +params.samplesheet = params.inputdir.toString() + "/SampleSheet.csv" +params.data_location = params.inputdir.toString() + "/" + params.project.toString() // Dynamic params +import java.text.SimpleDateFormat +SimpleDateFormat uniqueness_format = new SimpleDateFormat("yyyyMMddHHmmss") params { nf_uniqueness = uniqueness_format.format(new Date()) - outdir= params.inputdir + "/nextflow/" + nf_uniqueness + outdir= params.inputdir + "/nextflow/" + project + "_" + run_name + "_" + nf_uniqueness System.out.println "" - System.out.println "runName : "+runName - System.out.println "data : "+dataNature + System.out.println "run_name : "+run_name + System.out.println "data : "+data_nature System.out.println 
"sequencer : "+sequencer - System.out.println "machineID : "+machineID + System.out.println "machine_id : "+machine_id System.out.println "run_date : "+run_date - System.out.println "fcID : "+fcID + System.out.println "fc_id : "+fc_id System.out.println "lane : "+lane - System.out.println "demuxUniqueness : "+demuxUniqueness + System.out.println "demux_uniqueness : "+demux_uniqueness System.out.println "outdir : "+outdir System.out.println "" } @@ -54,7 +84,11 @@ process { } withName: DUPLICATED_READS { - publishDir path: "${params.outdir}/Duplicats" , mode: 'copy', pattern: "*.log" + publishDir = [ + path: "${params.outdir}/Duplicats", + mode: 'copy', + pattern: "*.log" + ] module = ['bioinfo/fastp-0.23.2'] time = { 5.h * task.attempt } memory = { 3.GB * task.attempt } @@ -75,7 +109,7 @@ process { saveAs: { filename -> "${name}.html" } ] - errorStrategy { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } maxRetries = 3 module = ['bioinfo/FastQC_v0.11.7'] time = { 1.h * task.attempt } @@ -127,7 +161,7 @@ process { withName: SEQTK_SAMPLE { ext.args = '-s100' - ext.args2 = 100000 + ext.args2 = params.sequencer == 'NovaSeq' ? params.nova_subset_seq : params.miseq_subset_seq module = 'bioinfo/seqtk-1.3' @@ -146,6 +180,7 @@ process { ].join(' ') module = '/tools/share/Modules/bioinfo/MultiQC-v1.11' + memory = { 5.GB * task.attempt } publishDir = [ path: { "${params.outdir}/MultiQC" }, diff --git a/conf/functions.config b/conf/functions.config new file mode 100644 index 0000000000000000000000000000000000000000..66ea1a9527ba2e46e9f93d7f44742cc53c1b93a5 --- /dev/null +++ b/conf/functions.config @@ -0,0 +1,198 @@ +def helpMessage() { + log.info""" + + Usage: + + The typical command for running the pipeline is as follows: + + nextflow run get-nf/template -profile prod -ansi-log false + + Mandatory arguments: + -profile Configuration profile to use. 
Can use multiple (comma separated) + Available: prod / dev. + + Options: + --samplesheet Default inputdir/samples.csv eg: SAMPLE_ID,SAMPLE_NAME,path/to/R1/fastq/file,path/to/R2/fastq/file (for paired-end only) + --contaminant Name of iGenomes // To be discussed ???? + --outdir The output directory where the results will be saved + --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits + --email_on_fail Same as --email, except only send mail if the workflow is not successful + --maxMultiqcEmailFileSize Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. + + + ======================================================= + Available profiles + -profile test Run the test dataset + -profile conda Build a new conda environment before running the pipeline. Use `--condaCacheDir` to define the conda cache path + -profile path Use the installation path defined for all tools. Use `--globalPath` to define the installation path + -profile docker Use the Docker images for each process + -profile singularity Use the singularity images for each process + -profile genologin Run the workflow on the cluster, instead of locally + + """.stripIndent() +} + + +def createSummary(formatted_date) { + def summary = [:] + if (workflow.revision) summary['Pipeline Release'] = workflow.revision + summary['Nextflow Run Name'] = workflow.runName + summary['Analysis Run Name'] = params.run_name + summary['Begin Date'] = formatted_date //format.format(new Date()) + summary['Sequencing Type'] = params.is_multiplex ? 
'Multiplex' : 'Simplex' + summary['Reference'] = params.reference_genome ?: params.reference_transcriptome?: '' + summary['Input dir'] = params.inputdir + //summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" + if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" + summary['Launch dir'] = workflow.launchDir + summary['Working dir'] = workflow.workDir + summary['Output dir'] = params.outdir + summary['Script dir'] = workflow.projectDir + summary['User'] = workflow.userName + if (workflow.profile == 'awsbatch') { + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue + } + summary['Config Profile'] = workflow.profile + if (params.email || params.email_on_fail) { + summary['E-mail Address'] = params.email + summary['E-mail on failure'] = params.email_on_fail + } + + return summary +} + +def customMailSend(body, subject, email_address) { + if (workflow.profile == 'dev') { + email_address = params.email_dev + try { + def sending = ['echo', '-e' , body ].execute() | [ 'mail', '-s', subject, email_address ].execute() + log.info "[$workflow.manifest.name] [DEV] Email sent successfully to $email_address." + mail_sent=true + } catch (all) { + log.error "[$workflow.manifest.name] [DEV] ERROR ON EMAIL SENDING TO $email_address!!" + mail_sent=false + } + } else { + try { + def mailCC = params.email_bioinfo + if (params.email_labo != "") { + mailCC = params.email_labo + ',' + params.email_bioinfo + } + def sending = ['echo', '-e' , body ].execute() | [ 'mail', '-s', subject, '-c', mailCC, email_address ].execute() + log.info "[$workflow.manifest.name] Email sent successfully ${mailCC} and ${email_address} !!" + mail_sent=true + } catch (all) { + log.error "[$workflow.manifest.name] ERROR ON EMAIL SENDING TO ${mailCC} AND ${email_address} !!" 
+ mail_sent=false + } + } + log.info "$body" + return mail_sent +} + +def sendBeginMail(formatted_date) { + def pipeline_info = workflow.manifest.name.split('/') + def pipeline_group = pipeline_info[0] + def pipeline_project = pipeline_info[1] + def pipeline_techno = pipeline_project.split('-')[1] + + def begin_subject = "[" + pipeline_techno + "] [" + pipeline_group + "] " + params.inputdir.split('/')[-1] + def begin_email_fields = [:] + begin_email_fields['version'] = workflow.manifest.version + begin_email_fields['wfRunName'] = workflow.runName + begin_email_fields['run_name'] = params.run_name + begin_email_fields['project'] = params.project + begin_email_fields['sequencer'] = params.sequencer + begin_email_fields['flowcell'] = params.fc_id + begin_email_fields['lane'] = params.lane + begin_email_fields['data_nature'] = params.data_nature + begin_email_fields['directory'] = params.inputdir + begin_email_fields['commandLine'] = workflow.commandLine + begin_email_fields['dateStart'] = formatted_date //format.format(new Date()) + begin_email_fields['homePage'] = workflow.manifest.homePage + begin_email_fields['name'] = workflow.manifest.name + + // Render the TXT template + def begin_engine = new groovy.text.GStringTemplateEngine() + def begin_tf = new File("$baseDir/assets/begin_template.txt") + def begin_txt_template = begin_engine.createTemplate(begin_tf).make(begin_email_fields) + + def begin_email_txt = begin_txt_template.toString() + + def email_address = params.email + if (email_address && !workflow.resume) { + mail_sent = customMailSend(begin_email_txt, begin_subject, email_address) + } +} + +def sendFinalMail(formatted_date, summary) { + def pipeline_info = workflow.manifest.name.split('/') + def pipeline_group = pipeline_info[0] + def pipeline_project = pipeline_info[1] + def pipeline_techno = pipeline_project.split('-')[1] + + def subject = "[" + pipeline_techno + "] [" + pipeline_group + "] " + params.inputdir.split('/')[-1] + if (workflow.success) { + 
subject += " : Successful" + } else { + subject += " : FAILED" + } + + def email_fields = [:] + email_fields['version'] = workflow.manifest.version + email_fields['runName'] = workflow.runName + email_fields['project'] = params.project + email_fields['run'] = params.run_name + email_fields['success'] = workflow.success + email_fields['dateComplete'] = formatted_date + email_fields['duration'] = workflow.duration + email_fields['exitStatus'] = workflow.exitStatus + email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + email_fields['errorReport'] = (workflow.errorReport ?: 'None') + email_fields['commandLine'] = workflow.commandLine + email_fields['projectDir'] = workflow.projectDir + email_fields['homePage'] = workflow.manifest.homePage + email_fields['name'] = workflow.manifest.name + email_fields['summary'] = summary + + //email_fields['summary']['Date Started'] = workflow.start + email_fields['summary']['Date Completed'] = workflow.complete //format.format(new Date()) + email_fields['summary']['Pipeline script file path'] = workflow.scriptFile + email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId + if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository + if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId + if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision + if (workflow.container) email_fields['summary']['Docker image'] = workflow.container + email_fields['summary']['Nextflow Version'] = workflow.nextflow.version + email_fields['summary']['Nextflow Build'] = workflow.nextflow.build + email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + + // Check if we are only sending emails on failure + email_address = params.email + if (!params.email && params.email_on_fail && !workflow.success) { + email_address = params.email_on_fail + } + + // Render the TXT template + 
def engine = new groovy.text.GStringTemplateEngine() + def tf = new File("$baseDir/assets/final_email_template.txt") + def txt_template = engine.createTemplate(tf).make(email_fields) + def email_txt = txt_template.toString() + + // Send the e-mail + if (email_address) { + mail_sent = customMailSend(email_txt, subject, email_address) + } + + // Write summary e-mail HTML to a file + def output_d = new File( "${params.outdir}/pipeline_info/" ) + if (!output_d.exists()) { + output_d.mkdirs() + } + def output_tf = new File( output_d, "pipeline_report.txt" ) + output_tf.withWriter { w -> w << email_txt } + + return mail_sent +} \ No newline at end of file diff --git a/conf/report.config b/conf/report.config index 385b8ecdb3929d3811f16e47f95e11ef49ad3c39..2d40a635290928a4c9b2b54e23fc509229e9b641 100644 --- a/conf/report.config +++ b/conf/report.config @@ -23,10 +23,10 @@ dag { } manifest { - name = 'get-nextflow-ngl-bi/wf-nanopore-nf' + name = 'GeT-nextflow-NGL-Bi/wf-illumina-nf' author = 'Jules Sabban' homePage = 'https://forgemia.inra.fr/get-nextflow-ngl-bi/wf-illumina-nf' - description = 'Workflow for Nanopore data quality control' + description = "Workflow for Illumina data quality control" mainScript = 'main.nf' nextflowVersion = '>=0.32.0' version = '1.0.0' diff --git a/main.nf b/main.nf index 9de84762554479b5622030acb442bff4525de209..be6b94b402a7bc43dcd9206e3d1df0998a1dbc6a 100644 --- a/main.nf +++ b/main.nf @@ -20,6 +20,12 @@ This script is based on : - the Curie institute template https://github.com/bioinfo-pf-curie/geniac-template/ */ +import java.text.SimpleDateFormat +SimpleDateFormat format = new SimpleDateFormat("dd/MM/yyyy HH:mm:ss") +include {createSummary} from "$baseDir/conf/functions.config" +params.summary = createSummary(format.format(new Date())) +params.summary.collect{k,v -> println "$k : $v"} + /* ======================================================================================== diff --git a/modules/local/module_core.nf 
b/modules/local/module_core.nf index ca2a7bf1d19974ccd98d95489168ca3933dfa4dd..f12fd855a0bf7bfea4949d94a1732ebd24c4ee16 100644 --- a/modules/local/module_core.nf +++ b/modules/local/module_core.nf @@ -3,7 +3,7 @@ */ process extractInfoForDemuxStats { - publishDir path: "${params.outdir}/Demux/Stats" , mode: 'copy' + publishDir path: "${params.outdir}/Demux" , mode: 'copy' input: path SampleSheet @@ -41,14 +41,14 @@ process demultiplexStats { process FASTQC { - tag " $name" input: tuple val(name), path(read) output: - tuple val(name), path("*_fastqc.{zip,html}") , emit: report + tuple val(name), path("*_fastqc.html") , emit: html + tuple val(name), path("*_fastqc.zip") , emit: zip // path log files script: diff --git a/modules/local/module_dna.nf b/modules/local/module_dna.nf index ea95679f3b5e8d715c75afe50cee3dd193315ae8..534950c56c001b989853bc40da213bb925f4b7b1 100644 --- a/modules/local/module_dna.nf +++ b/modules/local/module_dna.nf @@ -16,7 +16,7 @@ process BWA_ALIGNMENT { BWA_ALIGNMENT script: """ - bwa mem ${params.referenceGenome} ${reads} 1> ${sample}.sam 2> ${sample}.log + bwa mem ${params.reference_genome} ${reads} 1> ${sample}.sam 2> ${sample}.log """ } @@ -120,7 +120,7 @@ process alignmentQualityStats { tuple val(sample), path("*.png"), emit: graph script: - cigarOptions = params.splitReads ? "--readsplit" : "" + cigarOptions = params.split_reads ? 
"--readsplit" : "" if (params.pairedEnd) { """ diff --git a/nextflow.config b/nextflow.config index 26777bdf73c9dfca319004b47560781a7d3c6c7e..c161fa17997fbfe1fcd8a67f426cc3d1c3b53f90 100644 --- a/nextflow.config +++ b/nextflow.config @@ -4,14 +4,23 @@ // Global params params { // PARAMETRE POUR OUTILS - // TODO + + // Subset fastq files params + miseq_subset_byte = 20000000 // in byte <=> 20 000 reads + miseq_subset_seq = 20000 // in reads + nova_subset_byte = 700000000 // in byte <=> 1 000 000 reads + nova_subset_seq = 1000000 // in reads + large_indexing_nova_subset_byte = 350000000 // in byte <=> 500 000 reads + large_indexing_nova_subset_seq = 500000 // in reads // OTHERS - email="jules.sabban@inrae.fr" + email="" + email_dev="jules.sabban@inrae.fr" email_on_fail="jules.sabban@inrae.fr" email_bioinfo="get-plage.bioinfo@genotoul.fr" - email_labo="get-plage.labo@genotoul.fr" - + //email_labo="get-plage.labo@genotoul.fr" + email_labo="" + monochrome_logs = true help = false @@ -37,7 +46,7 @@ profiles { debug { process.beforeScript = 'echo $HOSTNAME' } docker { docker.enabled = true } singularity { singularity.enabled = true } - test { includeConfig "$baseDir/conf/test.config" } + dev { includeConfig "$baseDir/conf/test.config" } prod { includeConfig "$baseDir/conf/prod.config" } } diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf index 9ac154556b8f31c92e624cdf4933a4c3410c59f1..f71886cb365630cb57716f27a29cdf21c3189a1c 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -71,7 +71,7 @@ workflow CORE { demultiplexStats(ch_DemuxStatXML, extractInfoForDemuxStats.out, ch_DemuxSummary) // ----------- Illumina Filter // ou SubsetSeqFiles : dans quel cas on fait l'un ou l'autre ???? 
- if (params.sequencer == 'NovaSeq' & params.isMultiplex) { + if (params.sequencer == 'NovaSeq' & params.is_multiplex) { System.out.println "Les données ne nécessite pas de passer par IlluminaFilter" ch_read_good = ch_read } else { // Si MiSeq ou Nova + noIndex @@ -87,9 +87,21 @@ workflow CORE { // ----------- Recherche Duplicats GUNZIP(ch_read_good) - SEQTK_SAMPLE(GUNZIP.out) - DUPLICATED_READS( - SEQTK_SAMPLE.out + + def bytes_subset_seq = params.sequencer == 'NovaSeq' ? params.nova_subset_byte : params.miseq_subset_byte + System.out.println "Seuil de taille de fichier pour subset : " + bytes_subset_seq + " bytes." + GUNZIP.out.branch{ + large : it[1].size() >= bytes_subset_seq + small : it[1].size() < bytes_subset_seq + }.set{unzip_reads_split} + + unzip_reads_split.large.count().map{it}.subscribe onNext: { println it + " large fastq" } + unzip_reads_split.small.count().map{it}.subscribe onNext: { println it + " small fastq" } + + // Do subset only on large fastq files + SEQTK_SAMPLE(unzip_reads_split.large) + DUPLICATED_READS(unzip_reads_split.small + .mix(SEQTK_SAMPLE.out) .collect{it[1]} .flatten() .map { $it -> [ ($it.simpleName =~ /(.*)_R[1-2]_.*/)[0][1] , $it ] } @@ -97,7 +109,8 @@ workflow CORE { ) // need fastq paired !!! 
emit: - fastqc_report = FASTQC.out.report ?: Channel.empty() + fastqc_report = FASTQC.out.zip ?: Channel.empty() fastqscreen_report = FASTQSCREEN.out.report ?: Channel.empty() fastp_report = DUPLICATED_READS.out.json + subset_fastq = unzip_reads_split.small.mix(SEQTK_SAMPLE.out) } diff --git a/sub-workflows/local/dna_qc.nf b/sub-workflows/local/dna_qc.nf index 794f7aa9e1c760842ba57577538e7c50bdea478a..4be068643d855de2e361dbc540ea480f62028e52 100644 --- a/sub-workflows/local/dna_qc.nf +++ b/sub-workflows/local/dna_qc.nf @@ -26,7 +26,7 @@ workflow DNA_QC { fastq main: - if ( "$params.referenceGenome" != '' ) { + if ( "$params.reference_genome" != '' ) { BWA_ALIGNMENT(fastq) SAMTOOLS_VIEW(BWA_ALIGNMENT.out.sam) SAMTOOLS_SORT(SAMTOOLS_VIEW.out.bam) @@ -37,6 +37,7 @@ workflow DNA_QC { flagstats_output_emitted = SAMTOOLS_FLAGSTATS.out.txt } else { + System.out.println "Le paramètre reference_genome est vide : $params.reference_genome, on ne peut pas faire d'alignement" // If Qualimap and Samtools were not executed qualimap_report_emitted = Channel.empty() flagstats_output_emitted = Channel.empty() diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf index 778ec1e469895851b8ddcb7bad12d344c868d141..b0f4f1d09d7a60fff6d1f23f7a547e7022dd4b88 100644 --- a/workflow/illumina_qc.nf +++ b/workflow/illumina_qc.nf @@ -2,40 +2,12 @@ nextflow.enable.dsl = 2 -def helpMessage() { - log.info""" - - Usage: - - The typical command for running the pipeline is as follows: - - nextflow run get-nf/template -profile prod -ansi-log false - - Mandatory arguments: - -profile Configuration profile to use. Can use multiple (comma separated) - Available: prod / dev. - - Options: - --samplesheet Default inputdir/samples.csv eg: SAMPLE_ID,SAMPLE_NAME,path/to/R1/fastq/file,path/to/R2/fastq/file (for paired-end only) - --contaminant Name of iGenomes // To be discussed ???? 
- --outdir The output directory where the results will be saved - --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail Same as --email, except only send mail if the workflow is not successful - --maxMultiqcEmailFileSize Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. - - - ======================================================= - Available profiles - -profile test Run the test dataset - -profile conda Build a new conda environment before running the pipeline. Use `--condaCacheDir` to define the conda cache path - -profile path Use the installation path defined for all tools. Use `--globalPath` to define the installation path - -profile docker Use the Docker images for each process - -profile singularity Use the singularity images for each process - -profile genologin Run the workflow on the cluster, instead of locally - - """.stripIndent() -} +// Import custom functions +include { helpMessage; + createSummary; + sendBeginMail; + sendFinalMail +} from "$baseDir/conf/functions.config" // Show help message if (params.help) { @@ -43,6 +15,15 @@ if (params.help) { exit 0 } +// Print every non-void parameters +System.out.println "\nAffichage de tous les paramètres non vides :" +params.each{entry -> + if (entry.value != "") { + println "$entry.key:\t $entry.value" + } +} +System.out.println "\n" + // ------------------------------------------------- // CHANNELS // ------------------------------------------------- @@ -52,11 +33,11 @@ ch_DemuxStatXML=Channel.fromPath(params.inputdir+'/Stats/DemultiplexingStats.xml // fastq one by one ch_read=Channel - .fromPath(params.data+'/*_R{1,2}_*.fastq.gz') + 
.fromPath(params.data_location+'/*_R{1,2}_*.fastq.gz') .map{$it -> [$it.simpleName, $it]} // fastq paired -//ch_read_merged=Channel.fromFilePairs(params.data+'/*_R{1,2}_*.fastq.gz') +//ch_read_merged=Channel.fromFilePairs(params.data_location+'/*_R{1,2}_*.fastq.gz') mismatchNumber = params.sequencer == 'MiSeq'? 0 : 1 @@ -69,29 +50,34 @@ createDir = file(params.outdir).mkdir() // ------------------------------------------------- include { CORE } from "$baseDir/sub-workflows/local/core_pipeline.nf" include { DNA_QC } from "$baseDir/sub-workflows/local/dna_qc.nf" -//include { MULTIQC } from "$baseDir/modules/local/module_reports.nf" include { MULTIQC } from "${params.shared_modules}/multiqc.nf" include { workflow_summary as WORKFLOW_SUMMARY } from "${params.shared_modules}/workflow_summary.nf" +// ------------------------------------------------- +// EMAIL ON START +// ------------------------------------------------- +import java.text.SimpleDateFormat +SimpleDateFormat format = new SimpleDateFormat("dd/MM/yyyy HH:mm:ss") +sendBeginMail(format.format(new Date())) + // ------------------------------------------------- // WORKFLOW // ------------------------------------------------- workflow ILLUMINA_QC { + ch_mqc = Channel.empty() WORKFLOW_SUMMARY() CORE(ch_ss, ch_DemuxStatXML, ch_DemuxSummary, ch_read) /*ch_ngl, ch_runInfo, mismatchNumber, params.raw_data*/ - if (params.dataNature == 'DNA') { - DNA_QC(ch_read) + if (params.data_nature == 'DNA') { + DNA_QC(CORE.out.subset_fastq) + ch_mqc = ch_mqc.mix( + DNA_QC.out.qualimap_report.collect{it[1]}.ifEmpty([]), + DNA_QC.out.flagstats_output.collect{it[1]}.ifEmpty([]) + ) } else { System.out.println "Le QC des données non ADN n'est pas prit en charge pour le moment." - } - - // MultiQC - if ( "$params.referenceGenome" != '' ) { - System.out.println "Création de Channels vides pour les process non exécutés." 
- DNA_QC.out.qualimap_report = Channel.empty() - DNA_QC.out.flagstats_output = Channel.empty() + ch_mqc = ch_mqc.mix( Channel.empty() ) } MULTIQC(WORKFLOW_SUMMARY.out.ifEmpty([]) @@ -99,8 +85,7 @@ workflow ILLUMINA_QC { CORE.out.fastqc_report.collect{it[1]}.ifEmpty([]), CORE.out.fastqscreen_report.collect{it[1]}.ifEmpty([]), CORE.out.fastp_report.collect{it[1]}.ifEmpty([]), - DNA_QC.out.qualimap_report.collect{it[1]}.ifEmpty([]), - DNA_QC.out.flagstats_output.collect{it[1]}.ifEmpty([]) + ch_mqc.collect{it[1]}.ifEmpty([]) ).collect() ) /* @@ -116,4 +101,14 @@ workflow ILLUMINA_QC { methyl_qc sub-worflow */ -} \ No newline at end of file +} + +// ------------------------------------------------- +// EMAIL ON COMPLETE +// ------------------------------------------------- +def end_mail_sent = false +workflow.onComplete { + end_mail_sent = sendFinalMail(format.format(new Date()), params.summary) +} + +workflow.onError { } \ No newline at end of file