diff --git a/README.md b/README.md index 31a795dabcf997bd5a16dcea13d3464a32f25e1a..f243e25ed21dea577ab92273f6451119dc58c432 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Many of these steps are optional and their necessity depends on the desired anal * controls the quality of raw and cleaned data ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) * makes a taxonomic classification of cleaned reads ([Kaiju MEM](https://github.com/bioinformatics-centre/kaiju) + [kronaTools](https://github.com/marbl/Krona/wiki/KronaTools) + [plot_kaiju_stat.py](bin/plot_kaiju_stat.py) + [merge_kaiju_results.py](bin/merge_kaiju_results.py)) * `S02_ASSEMBLY` - * assembles reads ([metaSPAdes](https://github.com/ablab/spades) or [Megahit](https://github.com/voutcn/megahit) or [Hifiasm_meta](https://github.com/lh3/hifiasm-meta), [metaFlye](https://github.com/fenderglass/Flye) or [metaMDBG](https://github.com/GaetanBenoitDev/metaMDBG)) + * assembles reads ([metaSPAdes](https://github.com/ablab/spades) or [Megahit](https://github.com/voutcn/megahit) or [Hifiasm_meta](https://github.com/lh3/hifiasm-meta), [metaFlye](https://github.com/fenderglass/Flye)) * assesses the quality of assembly ([metaQUAST](http://quast.sourceforge.net/metaquast)) * reads deduplication, alignment against contigs for short reads ([BWA-MEM2](https://github.com/bwa-mem2/bwa-mem2) + [Samtools](http://www.htslib.org/)) * reads alignment against contigs for HiFi reads ([Minimap2](https://github.com/lh3/minimap2) + [Samtools](http://www.htslib.org/)) diff --git a/bin/filter_contig_per_cpm.py b/bin/filter_contig_per_cpm.py index fdea68d1d96b150bc8f2a09d7c328819055392be..7ec0b9d53c541722f456b1ba87cf8391a26a31b1 100755 --- a/bin/filter_contig_per_cpm.py +++ b/bin/filter_contig_per_cpm.py @@ -25,7 +25,6 @@ __status__ = 'dev' from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter import pandas as pd -import numpy as np import logging import pyfastx @@ -35,7 +34,7 @@ import pyfastx def parse_arguments(): """Parse script arguments.""" - parser = ArgumentParser(description="...", + parser = ArgumentParser(description="Calculate CPM per contig", formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument("-i", "--samtools_idxstats", nargs='+', required = True, @@ -59,36 +58,71 @@ def parse_arguments(): args = parser.parse_args() return args -def combine_idxstat_files(idxstat_files): +def combine_idxstat_files(idxstat_dfs): """ - Combine multiple idxstat files that have the same contigs. + Combine multiple idxstat df that have the same contigs. Sum the #_mapped_read_segments column over multiple idxstat files that have the same reference sequences. """ + + common_contigs = set(idxstat_dfs[0].index) + for df in idxstat_dfs[1:]: + common_contigs = common_contigs.union(set(df.index)) + + # Convert set to list for pandas indexing + common_contigs_list = sorted(list(common_contigs)) + + # Filter dataframes to keep only common contigs + filtered_dfs = [df.loc[common_contigs_list] for df in idxstat_dfs] + + # Combine dataframes + combined_df = filtered_dfs[0].copy() + + for df in filtered_dfs[1:]: + combined_df['#_mapped_read_segments'] += df['#_mapped_read_segments'] + combined_df['cpm_count'] += df['cpm_count'] + + return combined_df + +def combine_kept_contigs(kept_contigs_list): + """Combine multiple lists of kept contigs.""" + # Get contigs that appear in all lists (union) + common_contigs = set(kept_contigs_list[0]) + for contigs in kept_contigs_list[1:]: + common_contigs = common_contigs.union(set(contigs)) + return pd.Index(list(common_contigs)) + +def filter_cpm(idxstat_file, cpm_cutoff): + """ + Calculate and filter byCPM for each contig. + """ columns_names = ['reference_sequence_name', 'sequence_length', - '#_mapped_read_segments', + '#_mapped_read_segments', '#_unmapped_read-segments'] - idxstat_df = pd.read_csv(idxstat_files[0], - sep ='\t', + idxstat_df = pd.read_csv(idxstat_file, sep ='\t', names = columns_names, usecols = ['reference_sequence_name', 'sequence_length', '#_mapped_read_segments',], comment="*").set_index('reference_sequence_name') - - for idxstat_file in idxstat_files[1:]: - other_idxstat_df = pd.read_csv(idxstat_file, - sep ='\t', - names = columns_names, - usecols = ['reference_sequence_name', - '#_mapped_read_segments',], - comment="*").set_index('reference_sequence_name') - - idxstat_df['#_mapped_read_segments'] += other_idxstat_df['#_mapped_read_segments'] + + + logging.info(f'idxstat_file: {idxstat_df}') + + + sum_reads = idxstat_df['#_mapped_read_segments'].sum() + idxstat_df['cpm_count'] = 1e6 * (idxstat_df['#_mapped_read_segments'] / sum_reads) + + # Apply CPM cutoff + kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index + + logging.info(f'length of idxstat_file: {len(idxstat_df)}') + logging.info(f'length of kept_contigs: {len(kept_contigs)}') + + return kept_contigs, idxstat_df - return idxstat_df def main(): args = parse_arguments() @@ -101,27 +135,31 @@ def main(): logging.basicConfig(format="%(levelname)s: %(message)s") cpm_cutoff = float(args.cutoff_cpm) - - # Read input tables - idxstat_df = combine_idxstat_files(args.samtools_idxstats) - - # Calculates cpm for each contig - sum_reads = idxstat_df['#_mapped_read_segments'].sum() - logging.info(f'Total number of mapped reads {sum_reads}') - - logging.info(f'With a cpm cutoff of {args.cutoff_cpm}, contigs with less than {(sum_reads*cpm_cutoff)/1e6} reads are removed.') - idxstat_df['cpm_count'] = 1e6 * idxstat_df['#_mapped_read_segments']/sum_reads - - # Contigs with nb reads > cutoff - kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index - - logging.info(f'{len(kept_contigs)}/{len(idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.') + #Â logging.info(f'idxstat file: {idxstat_file}') + idxstat_results = [filter_cpm(idxstat_file, cpm_cutoff) for idxstat_file in args.samtools_idxstats] + + # Separate idxstat and kept_contigs + kept_contigs, idxstat_dfs = zip(*idxstat_results) + + logging.info(f'number of kept_contigs: {len(kept_contigs)}') + logging.info(f'number of idxstat_file: {len(idxstat_dfs)}') + + # Combine idxstat dataframes + combined_idxstat_df = combine_idxstat_files(idxstat_dfs) + logging.info(f'length of list_idxstat_df: {len(combined_idxstat_df)}') + + combined_kept_contigs = combine_kept_contigs(kept_contigs) + combined_kept_contigs = set(combined_kept_contigs.tolist()) + logging.info(f'length of combined_kept_contigs: {len(combined_kept_contigs)}') + + logging.info(f'{len(combined_kept_contigs)}/{len(combined_idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.') + # Write new fasta files with kept and unkept contigs with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle: for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False): - if contig in kept_contigs: + if contig in combined_kept_contigs: out_select_handle.write(f'>{contig}\n{seq}\n') else: out_discard_handle.write(f'>{contig}\n{seq}\n') diff --git a/conf/base.config b/conf/base.config index aa3d2cd07f40da0c956e9bf814171d0562b0e78f..061d014bba48b8aef1c4a273144a3d3764a52d2d 100644 --- a/conf/base.config +++ b/conf/base.config @@ -17,6 +17,7 @@ process { maxErrors = '-1' withName: CUTADAPT { + container = 'quay.io/biocontainers/cutadapt:5.0--py310h1fe012e_0' cpus = 8 memory = { 8.GB * task.attempt } } @@ -28,6 +29,7 @@ process { memory = { 8.GB * task.attempt } } withName: MULTIQC { + container = 'quay.io/biocontainers/multiqc:1.27.1--pyhdfd78af_0' memory = { 8.GB * task.attempt } } withName: HOST_FILTER { @@ -105,6 +107,7 @@ process { cpus = 10 } withName: GTDBTK { + container = 'quay.io/biocontainers/gtdbtk:2.4.0--pyhdfd78af_2' memory = { 64.GB * task.attempt } cpus = 16 } diff --git a/docs/source/output.md b/docs/source/output.md index cbea201d116846e7a0d16a68fff4ef0051421bf3..e9ca730b37905edba7a7e9ee95f2de95df68b30c 100644 --- a/docs/source/output.md +++ b/docs/source/output.md @@ -53,7 +53,7 @@ The `results/` directory contains a sub-directory for each step launched: ```{note} In this directory you have either the results of assembly with `metaspades` or `megahit` -if you analyse short read data and `hifiasm-meta`, `metaflye` or `metamdbg` +if you analyse short read data and `hifiasm-meta` or `metaflye`. if you analyse HiFi data. You have chosen your assembly tool with `--assembly` parameter. ``` diff --git a/docs/source/usage.md b/docs/source/usage.md index c49c5b200eeed38f40055b20e5d56e4a8d795314..5aae59f8d64e0a42264da7126874285b2c4e6ede 100644 --- a/docs/source/usage.md +++ b/docs/source/usage.md @@ -351,7 +351,7 @@ See [II. Input files](usage.md#ii-input-files) and warnings. | Parameter | Description | | ------------ | ----------- | -| `--assembly` | allows to indicate the assembly tool.<br /> For short reads: `["metaspades" or "megahit"]`: Default: `metaspades`.<br /> For HiFi reads: `["hifiasm-meta", "metaflye", "metamdbg"]`. Default: `hifiasm-meta`.| +| `--assembly` | allows to indicate the assembly tool.<br /> For short reads: `["metaspades" or "megahit"]`: Default: `metaspades`.<br /> For HiFi reads: `["hifiasm-meta", "metaflye"]`. Default: `hifiasm-meta`.| | `--coassembly` | allows to assemble together the samples labeled with the same group in the samplesheet. <br /> It will generate one assembly for each group. To co-assemble all of your samples together, <br /> you must indicate a unique group for each sample in the samplesheet.| ```{warning} @@ -362,7 +362,7 @@ for each group co-assembled and automatically mapping with every sample of his g * For short reads, the user can choose between `metaspades` or `megahit` for `--assembly` parameter. The choice can be based on CPUs and memory availability: `metaspades` needs more CPUs and memory than `megahit` but our tests showed that assembly metrics are better for `metaspades` than `megahit`. For PacBio HiFi reads, the user -can choose between `hifiasm-meta`, `metaflye` or `metamdbg`. +can choose between `hifiasm-meta` or `metaflye`. ``` ```{note} diff --git a/env/binning.yml b/env/binning.yml index 77d3f73423e6e3cc1c14a65f5d97b37535f0c762..45a2405cd04522b9be88220d22370ab00762161d 100644 --- a/env/binning.yml +++ b/env/binning.yml @@ -26,6 +26,5 @@ dependencies: - metabat2=2.15 - maxbin2=2.2.7 - concoct=1.1.0 - - gtdbtk=2.1 - drep=3.0.0 - pprodigal diff --git a/env/metagWGS.yml b/env/metagWGS.yml index 463510dd7f7935b58f2d2ee2db86c44b2aa9d369..cb02fd831104c8f980fa01af49a32857bb7f06c6 100644 --- a/env/metagWGS.yml +++ b/env/metagWGS.yml @@ -8,7 +8,6 @@ dependencies: - bcbio-gff=0.6.9 - bwa-mem2=2.2.1 - cd-hit=4.8.1 - - cutadapt=4.2 - diamond=2.0.15 - eggnog-mapper=2.1.9 - fastqc=0.11.9 @@ -19,7 +18,6 @@ dependencies: - krona=2.8.1 - megahit=1.2.9 - minimap2=2.24 - - multiqc=1.14 - pandas=1.5.2 - plotly - prodigal=2.6.3 diff --git a/modules/get_software_versions.nf b/modules/get_software_versions.nf index a60f2174a31f01a32b0820d864d3ee2cbc8321ff..97cbef9430efb71648db97d80242b4e0750d2d1d 100644 --- a/modules/get_software_versions.nf +++ b/modules/get_software_versions.nf @@ -17,7 +17,6 @@ process GET_SOFTWARE_VERSIONS { echo $workflow.manifest.version > v_pipeline.txt echo $workflow.nextflow.version > v_nextflow.txt python --version &> v_python.txt - multiqc --version &> v_multiqc.txt scrape_software_versions.py > software_versions_mqc.yaml """ } \ No newline at end of file diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf index 73499bae789f36cba129a85fc58459f09374c271..ba9b86decd7a408a9d740b5f23254d7e5c2b53bf 100644 --- a/modules/gtdbtk.nf +++ b/modules/gtdbtk.nf @@ -15,7 +15,7 @@ process GTDBTK { export GTDBTK_DATA_PATH=$gtdbtk_db - gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --pplacer_cpus ${task.cpus} --cpus ${task.cpus} + gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --skip_ani_screen --pplacer_cpus ${task.cpus} --cpus ${task.cpus} echo \$(gtdbtk -h 2>&1) &> v_gtdbtk.txt """ } \ No newline at end of file diff --git a/modules/multiqc.nf b/modules/multiqc.nf index d4581ade489f255926ba3abdc8cd6f3299a60d92..68b88160af8106134c262bd92cca05676b3dbc78 100644 --- a/modules/multiqc.nf +++ b/modules/multiqc.nf @@ -26,7 +26,7 @@ process MULTIQC { script: """ - multiqc . --config ${multiqc_config} -m custom_content -m fastqc -m cutadapt -m sickle -m kaiju -m quast -m prokka -m featureCounts -m samtools + multiqc . --config ${multiqc_config} -m custom_content -m fastqc -m cutadapt -m sickle -m kaiju -m quast -m prokka -m featurecounts -m samtools multiqc --version > v_multiqc.txt """ }