diff --git a/MANUAL.md b/MANUAL.md index b023baad..48f88ff4 100644 --- a/MANUAL.md +++ b/MANUAL.md @@ -335,6 +335,11 @@ be substituted with S3 links. Descriptions for creating all files can be found i version: 1.2.0 mutation_calling: + consensus: + indel_majority: -> Number of callers required to accept an indel. + Will dynamically compute the indel majority by default. + snv_majority: -> Number of callers required to accept an snv. + Will dynamically compute the snv majority by default. indexes: chromsosomes: canonical, canonical_chr, chr1, Y -> A list of chromosomes to process. This options overrides the @@ -362,8 +367,10 @@ be substituted with S3 links. Descriptions for creating all files can be found i java_Xmx: 5G -> The heap size to use for MuTect per job (i.e. per chromosome) version: 1.1.7 + run: True -> Switch to skip calling mutect muse: version: 1.0rc_submission_b391201 + run: True -> Switch to skip calling muse radia: -> Radia uses perchrom bed files in folders as references. cosmic_beds: /path/to/radia_cosmic.tar.gz @@ -372,17 +379,20 @@ be substituted with S3 links. Descriptions for creating all files can be found i pseudogene_beds: /path/to/radia_pseudogenes.tar.gz gencode_beds: /path/to/radia_gencode.tar.gz version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03 + run: True -> Switch to skip calling radia somaticsniper: version: 1.0.4 samtools: -> pileup reads version: 0.1.8 bam_readcount: -> obtain readcounts version: 0.7.4 + run: True -> Switch to skip calling somaticsniper strelka: version: 1.0.15 config_file: /path/to/strelka_config.ini.tar.gz -> The Strelka config file for a bwa run (modified for a WXS run if necessary) + run: True -> Switch to skip calling strelka star_fusion: run: True -> Switch to skip fusion calling version: 1.0.0 diff --git a/src/protect/common.py b/src/protect/common.py index 17d72b5e..1839a2a5 100644 --- a/src/protect/common.py +++ b/src/protect/common.py @@ -638,7 +638,7 @@ def email_report(job, univ_options): server = smtplib.SMTP('localhost') except socket.error as e: if e.errno == 111: - print('No mail utils on this maachine') + print('No mail utils on this machine') else: print('Unexpected error while attempting to send an email.') print('Could not send email report') diff --git a/src/protect/mutation_annotation/snpeff.py b/src/protect/mutation_annotation/snpeff.py index 43ed03c0..e3665370 100644 --- a/src/protect/mutation_annotation/snpeff.py +++ b/src/protect/mutation_annotation/snpeff.py @@ -39,6 +39,8 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options): :return: fsID for the snpeffed vcf :rtype: toil.fileStore.FileID """ + if merged_mutation_file is None: + return None work_dir = os.getcwd() input_files = { 'merged_mutations.vcf': merged_mutation_file, @@ -46,7 +48,6 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options): input_files = get_files_from_filestore(job, input_files, work_dir, docker=False) input_files['snpeff_index'] = untargz(input_files['snpeff_index.tar.gz'], work_dir) input_files = {key: docker_path(path) for key, path in input_files.items()} - parameters = ['eff', '-dataDir', input_files['snpeff_index'], '-c', '/'.join([input_files['snpeff_index'], diff --git a/src/protect/mutation_calling/common.py b/src/protect/mutation_calling/common.py index 947426d2..05dc8d55 100644 --- a/src/protect/mutation_calling/common.py +++ b/src/protect/mutation_calling/common.py @@ -51,27 +51,46 @@ def chromosomes_from_fai(genome_fai): return chromosomes -def run_mutation_aggregator(job, mutation_results, univ_options): +def run_mutation_aggregator(job, mutation_results, univ_options, consensus_options): """ Aggregate all the called mutations. :param dict mutation_results: Dict of dicts of the various mutation callers in a per chromosome format :param dict univ_options: Dict of universal options used by almost all tools + :param dict consensus_options: options specific for consensus mutation calling :returns: fsID for the merged mutations file :rtype: toil.fileStore.FileID """ # Setup an input data structure for the merge function out = {} - for chrom in mutation_results['mutect'].keys(): - out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results, - univ_options).rv() - merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options) - job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient']) - return merged_snvs.rv() + chroms = {} + # Extract the chromosomes from a mutation caller if at least one mutation caller is selected. All callers should + # have the same chromosomes. + for caller in mutation_results: + if mutation_results[caller] is None: + continue + else: + if caller == 'strelka': + if mutation_results['strelka']['snvs'] is None: + continue + chroms = mutation_results['strelka']['snvs'].keys() + else: + chroms = mutation_results[caller].keys() + break + if chroms: + for chrom in chroms: + out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results, + univ_options, consensus_options).rv() + merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options) + job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient']) + return merged_snvs.rv() + else: + return None + -def merge_perchrom_mutations(job, chrom, mutations, univ_options): +def merge_perchrom_mutations(job, chrom, mutations, univ_options, consensus_options): """ Merge the mutation calls for a single chromosome. @@ -79,6 +98,7 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options): :param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict of per chromosome job store ids for vcfs as value :param dict univ_options: Dict of universal options used by almost all tools + :param dict consensus_options: options specific for consensus mutation calling :returns fsID for vcf contaning merged calls for the given chromosome :rtype: toil.fileStore.FileID """ @@ -91,39 +111,40 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options): mutations.pop('indels') mutations['strelka_indels'] = mutations['strelka']['indels'] mutations['strelka_snvs'] = mutations['strelka']['snvs'] - vcf_processor = {'snvs': {'mutect': process_mutect_vcf, + vcf_processor = {'snv': {'mutect': process_mutect_vcf, 'muse': process_muse_vcf, 'radia': process_radia_vcf, 'somaticsniper': process_somaticsniper_vcf, 'strelka_snvs': process_strelka_vcf }, - 'indels': {'strelka_indels': process_strelka_vcf + 'indel': {'strelka_indels': process_strelka_vcf } } - # 'fusions': lambda x: None, - # 'indels': lambda x: None} - # For now, let's just say 2 out of n need to call it. - # num_preds = len(mutations) - # majority = int((num_preds + 0.5) / 2) - majority = {'snvs': 2, - 'indels': 1} - accepted_hits = defaultdict(dict) - for mut_type in vcf_processor.keys(): # Get input files perchrom_mutations = {caller: vcf_processor[mut_type][caller](job, mutations[caller][chrom], work_dir, univ_options) - for caller in vcf_processor[mut_type]} + for caller in vcf_processor[mut_type] + if mutations[caller] is not None} + if not perchrom_mutations: + continue # Process the strelka key - perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type] - perchrom_mutations.pop('strelka_' + mut_type) + if 'strelka_' + mut_type in perchrom_mutations: + perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type] + perchrom_mutations.pop('strelka_' + mut_type) + if consensus_options[mut_type + '_majority'] is not None: + majority = consensus_options[mut_type + '_majority'] + elif len(perchrom_mutations) <= 2: + majority = 1 + else: + majority = (len(perchrom_mutations) + 1) / 2 # Read in each file to a dict vcf_lists = {caller: read_vcf(vcf_file) for caller, vcf_file in perchrom_mutations.items()} all_positions = list(set(itertools.chain(*vcf_lists.values()))) for position in sorted(all_positions): hits = {caller: position in vcf_lists[caller] for caller in perchrom_mutations.keys()} - if sum(hits.values()) >= majority[mut_type]: + if sum(hits.values()) >= majority: callers = ','.join([caller for caller, hit in hits.items() if hit]) assert position[1] not in accepted_hits[position[0]] accepted_hits[position[0]][position[1]] = (position[2], position[3], callers) @@ -157,7 +178,7 @@ def read_vcf(vcf_file): if line.startswith('#'): continue line = line.strip().split() - vcf_dict.append((line[0], line[1], line[3], line[4])) + vcf_dict.append((line[0], line[1], line[3].upper(), line[4].upper())) return vcf_dict diff --git a/src/protect/mutation_calling/indel.py b/src/protect/mutation_calling/indel.py index 7c8990da..81c3da6a 100644 --- a/src/protect/mutation_calling/indel.py +++ b/src/protect/mutation_calling/indel.py @@ -27,7 +27,9 @@ def run_indel_caller(job, tumor_bam, normal_bam, univ_options, indel_options): :return: fsID to the merged fusion calls :rtype: toil.fileStore.FileID """ - job.fileStore.logToMaster('INDELs are currently unsupported.... Skipping.') + job.fileStore.logToMaster('INDELs currently occur only through strelka.') + if indel_options['run'] is False: + return None indel_file = job.fileStore.getLocalTempFile() output_file = job.fileStore.writeGlobalFile(indel_file) job.fileStore.logToMaster('Ran INDEL on %s successfully' % univ_options['patient']) diff --git a/src/protect/mutation_calling/muse.py b/src/protect/mutation_calling/muse.py index 28be1c6b..0617c41f 100644 --- a/src/protect/mutation_calling/muse.py +++ b/src/protect/mutation_calling/muse.py @@ -78,6 +78,8 @@ def run_muse(job, tumor_bam, normal_bam, univ_options, muse_options): +- 'chrM': fsID :rtype: dict """ + if muse_options['run'] is False: + return None # Get a list of chromosomes to handle if muse_options['chromosomes']: chromosomes = muse_options['chromosomes'] diff --git a/src/protect/mutation_calling/mutect.py b/src/protect/mutation_calling/mutect.py index eb1a556f..56c385c2 100644 --- a/src/protect/mutation_calling/mutect.py +++ b/src/protect/mutation_calling/mutect.py @@ -75,6 +75,8 @@ def run_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options): +- 'chrM': fsID :rtype: dict """ + if mutect_options['run'] is False: + return None # Get a list of chromosomes to handle if mutect_options['chromosomes']: chromosomes = mutect_options['chromosomes'] diff --git a/src/protect/mutation_calling/radia.py b/src/protect/mutation_calling/radia.py index 178bd8dd..6d4c95cd 100644 --- a/src/protect/mutation_calling/radia.py +++ b/src/protect/mutation_calling/radia.py @@ -87,6 +87,8 @@ def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options): +- 'chrM': fsID :rtype: dict """ + if radia_options['run'] is False: + return None if 'rna_genome' in rna_bam.keys(): rna_bam = rna_bam['rna_genome'] elif set(rna_bam.keys()) == {'rna_genome_sorted.bam', 'rna_genome_sorted.bam.bai'}: diff --git a/src/protect/mutation_calling/somaticsniper.py b/src/protect/mutation_calling/somaticsniper.py index ae148f04..ba903253 100644 --- a/src/protect/mutation_calling/somaticsniper.py +++ b/src/protect/mutation_calling/somaticsniper.py @@ -85,6 +85,8 @@ def run_somaticsniper(job, tumor_bam, normal_bam, univ_options, somaticsniper_op +- 'chrM': fsID :rtype: toil.fileStore.FileID|dict """ + if somaticsniper_options['run'] is False: + return None # Get a list of chromosomes to handle if somaticsniper_options['chromosomes']: chromosomes = somaticsniper_options['chromosomes'] diff --git a/src/protect/mutation_calling/strelka.py b/src/protect/mutation_calling/strelka.py index 0a1a0e7b..708b6cb1 100644 --- a/src/protect/mutation_calling/strelka.py +++ b/src/protect/mutation_calling/strelka.py @@ -45,7 +45,10 @@ def run_strelka_with_merge(job, tumor_bam, normal_bam, univ_options, strelka_opt :param dict univ_options: Dict of universal options used by almost all tools :param dict strelka_options: Options specific to strelka :return: fsID to the merged strelka calls - :rtype: toil.fileStore.FileID + :rtype: dict(str, toil.fileStore.FileID) + |-'snvs': fsID + | + +-'indels': fsID """ spawn = job.wrapJobFn(run_strelka, tumor_bam, normal_bam, univ_options, strelka_options, split=False).encapsulate() @@ -63,8 +66,9 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split :param dict univ_options: Dict of universal options used by almost all tools :param dict strelka_options: Options specific to strelka :param bool split: Should the results be split into perchrom vcfs? - :return: Either the fsID to the genome-level vcf or a dict of results from running strelka - on every chromosome + :return: Either the a dict of results from running strelka on every chromosome, or a dict of running strelka on the + whole genome + perchrom_strelka: |- 'chr1': | |-'snvs': fsID @@ -77,8 +81,16 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split +- 'chrM': |-'snvs': fsID +-'indels': fsID - :rtype: toil.fileStore.FileID|dict + + genome_strelka: + |-'snvs': fsID + | + +-'indels': fsID + + :rtype: dict """ + if strelka_options['run'] is False: + return {'snvs': None, 'indels': None} if strelka_options['chromosomes']: chromosomes = strelka_options['chromosomes'] else: diff --git a/src/protect/pipeline/ProTECT.py b/src/protect/pipeline/ProTECT.py index 6b40fb79..b8162fe7 100644 --- a/src/protect/pipeline/ProTECT.py +++ b/src/protect/pipeline/ProTECT.py @@ -354,6 +354,8 @@ def _parse_config_file(job, config_file, max_cores=None): # Flags to check for presence of encryption keys if required gdc_inputs = ssec_encrypted = False + # Flag to check if a sample without an input vcf/bedpe was provided + sample_without_variants = False for key in input_config.keys(): if key == 'patients': # Ensure each patient contains the required entries @@ -373,6 +375,9 @@ def _parse_config_file(job, config_file, max_cores=None): raise ParameterError('Cannot run ProTECT using GDC RNA bams. Please fix ' 'sample %s' % sample_name) gdc_inputs = True + if ('mutation_vcf' not in sample_set[sample_name] + and 'fusion_bedpe' not in sample_set[sample_name]): + sample_without_variants = True else: # Ensure the required entries exist for this key _ensure_set_contains(input_config[key], required_keys[key], key) @@ -427,6 +432,27 @@ def _parse_config_file(job, config_file, max_cores=None): 'token.') # Get all the tool inputs job.fileStore.logToMaster('Obtaining tool inputs') + if (sample_without_variants and all(tool_options[k]['run'] is False + for k in mutation_caller_list + if k not in ('indexes', 'fusion_inspector', 'consensus'))): + raise RuntimeError("Cannot run mutation callers if all callers are set to run = False.") + + for mut_type in ('snv', 'indel'): + if tool_options['consensus'][mut_type + '_majority'] is not None: + if not isinstance(tool_options['consensus'][mut_type + '_majority'], int): + raise RuntimeError('Majorities have to be integers. Got %s for %s_majority.' % + (tool_options['consensus'][mut_type + '_majority'], mut_type)) + if mut_type == 'snv': + count = sum(tool_options[k]['run'] is True + for k in ('muse', 'mutect', 'somaticsniper', 'radia', 'strelka')) + else: + count = 1 if tool_options['strelka']['run'] is True else 0 + if tool_options['consensus'][mut_type + '_majority'] > count: + raise RuntimeError('Majority cannot be greater than the number of callers. ' + 'Got number of %s callers = %s majority = %s.' % + (mut_type, count, + tool_options['consensus'][mut_type + '_majority'])) + process_tool_inputs = job.addChildJobFn(get_all_tool_inputs, tool_options, mutation_caller_list=mutation_caller_list) job.fileStore.logToMaster('Obtained tool inputs') @@ -670,7 +696,7 @@ def launch_protect(job, patient_data, univ_options, tool_options): bam_files['normal_dna'].rv(), univ_options, tool_options['strelka']).encapsulate(), 'indels': job.wrapJobFn(run_indel_caller, bam_files['tumor_dna'].rv(), - bam_files['normal_dna'].rv(), univ_options, 'indel_options', + bam_files['normal_dna'].rv(), univ_options, {'run': False}, disk='100M', memory='100M', cores=1)} for sample_type in 'tumor_dna', 'normal_dna': for caller in mutations: @@ -678,7 +704,7 @@ def launch_protect(job, patient_data, univ_options, tool_options): bam_files['tumor_rna'].addChild(mutations['radia']) get_mutations = job.wrapJobFn(run_mutation_aggregator, {caller: cjob.rv() for caller, cjob in mutations.items()}, - univ_options, disk='100M', memory='100M', + univ_options, tool_options['consensus'], disk='100M', memory='100M', cores=1).encapsulate() for caller in mutations: mutations[caller].addChild(get_mutations) @@ -768,7 +794,7 @@ def get_all_tool_inputs(job, tools, outer_key='', mutation_caller_list=None): indexes = tools.pop('indexes') indexes['chromosomes'] = parse_chromosome_string(job, indexes['chromosomes']) for mutation_caller in mutation_caller_list: - if mutation_caller == 'indexes': + if mutation_caller in ('indexes', 'consensus'): continue tools[mutation_caller].update(indexes) return tools diff --git a/src/protect/pipeline/defaults.yaml b/src/protect/pipeline/defaults.yaml index 145d2fa4..bd2bf5de 100644 --- a/src/protect/pipeline/defaults.yaml +++ b/src/protect/pipeline/defaults.yaml @@ -55,22 +55,30 @@ expression_estimation: version: 1.2.20 mutation_calling: + consensus: + indel_majority: + snv_majority: indexes: chromosomes: mutect: java_Xmx: 2G version: 1.1.7 + run: True muse: version: 1.0rc_submission_b391201 + run: True radia: version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03 + run: True somaticsniper: + run: True version: 1.0.4 samtools: version: 0.1.8 bam_readcount: version: 0.7.4 strelka: + run: True version: 1.0.15 star_fusion: diff --git a/src/protect/pipeline/input_parameters.yaml b/src/protect/pipeline/input_parameters.yaml index 5dc5943e..e98182b6 100644 --- a/src/protect/pipeline/input_parameters.yaml +++ b/src/protect/pipeline/input_parameters.yaml @@ -83,6 +83,9 @@ expression_estimation: # version: 1.2.0 mutation_calling: + consensus: + indel_majority: 1 + snv_majority: 2 indexes: chromosomes: canonical_chr, chrM genome_fasta: S3://protect-data/hg38_references/hg38.fa.tar.gz @@ -95,8 +98,10 @@ mutation_calling: dbsnp_tbi: S3://protect-data/hg38_references/dbsnp_coding.vcf.gz.tbi mutect: java_Xmx: 2G + # run: True # version: 1.1.7 muse: + # run: True # version: 1.0rc_submission_b391201 radia: cosmic_beds: S3://protect-data/hg38_references/radia_cosmic.tar.gz @@ -104,8 +109,10 @@ mutation_calling: retrogene_beds: S3://protect-data/hg38_references/radia_retrogenes.tar.gz pseudogene_beds: S3://protect-data/hg38_references/radia_pseudogenes.tar.gz gencode_beds: S3://protect-data/hg38_references/radia_gencode.tar.gz + # run: True # version: 398366ef07b5911d8082ed61cbf03d487a41f286 somaticsniper: + # run: True # version: 1.0.4 samtools: # version: 0.1.8 @@ -118,6 +125,7 @@ mutation_calling: #run_trinity: True #version: 1.0.1 strelka: + # run: True # version: 1.0.15 config_file: S3://protect-data/hg38_references/strelka_bwa_WXS_config.ini.tar.gz