diff --git a/qiime/workflow/pick_open_reference_otus.py b/qiime/workflow/pick_open_reference_otus.py index a28532bea2..080c06c0ed 100644 --- a/qiime/workflow/pick_open_reference_otus.py +++ b/qiime/workflow/pick_open_reference_otus.py @@ -10,6 +10,8 @@ __maintainer__ = "Greg Caporaso" __email__ = "gregcaporaso@gmail.com" +from itertools import izip +from os import remove from os.path import split, splitext, getsize, exists, abspath, join from shutil import copyfile, rmtree from numpy import inf @@ -1091,3 +1093,220 @@ def pick_subsampled_open_reference_otus(input_fp, if not suppress_index_page: index_fp = '%s/index.html' % output_dir generate_index_page(index_links, index_fp) + + +def convergent_pick_subsampled_open_reference_otus( + input_fps, + refseqs_fp, + output_dir, + percent_subsample, + new_ref_set_id, + command_handler, + params, + qiime_config, + prefilter_refseqs_fp=None, + prefilter_percent_id=None, + min_otu_size=2, + run_assign_tax=True, + run_align_and_tree=True, + step1_otu_map_fp=None, + step1_failures_fasta_fp=None, + parallel=False, + suppress_step4=False, + logger=None, + suppress_md5=False, + denovo_otu_picking_method='uclust', + reference_otu_picking_method='uclust_ref', + status_update_callback=print_to_stdout, + minimum_failure_threshold=100000, + num_seqs=10000): + """Call the convergent open reference workflow""" + create_dir(output_dir) + commands = [] + + if logger is None: + logger = WorkflowLogger(generate_log_fp(output_dir), params=params, + qiime_config=qiime_config) + close_logger_on_success = True + else: + close_logger_on_success = False + + # if the user has not passed a different reference collection for the + # pre-filter, used the input refseqs_fp for all iterations. we want to + # pre-filter all data against the input data as lower percent identity + # searches with uclust can be slow, so we want the reference collection to + # stay at a reasonable size. + if prefilter_refseqs_fp is None: + prefilter_refseqs_fp = refseqs_fp + + otu_table_fps = [] + repset_fasta_fps = [] + + # Construct an array of sequence generators + seq_generators = [parse_fasta(open(fp, 'U')) for fp in input_fps] + + iteration = 0 + + # Iterate until all seqs are clustered + while len(seq_generators) > 0: + iteration_output_dir = '%s/%d/' % (output_dir, iteration) + iter_input_fp = join(output_dir, "iter_%s_seqs.fna" % iteration) + + next_iter_seq_gen = [] + + # Write the sequences for this iteration + with open(iter_input_fp, 'w') as f: + + # Loop through all the input sequences + for seq_gen in seq_generators: + + # We need to write 'num_seqs' from each input + count_added = 0 + for i, e in izip(range(num_seqs), seq_gen): + f.write(">%s\n%s\n" % e) + count_added += 1 + + # More sequences still in this generator + if count_added == num_seqs: + next_iter_seq_gen.append(seq_gen) + + # At this point we have already created the new input file with + # <= num_seqs * len(seq_generator) sequences for this iteration, + # so we can run pick_subsampled_open_reference_otus + pick_subsampled_open_reference_otus( + input_fp=iter_input_fp, + refseqs_fp=refseqs_fp, + output_dir=iteration_output_dir, + percent_subsample=percent_subsample, + new_ref_set_id='%s.Iter.%d' % (new_ref_set_id, iteration), + command_handler=command_handler, + params=params, + qiime_config=qiime_config, + run_assign_tax=False, + run_align_and_tree=False, + prefilter_refseqs_fp=prefilter_refseqs_fp, + prefilter_percent_id=prefilter_percent_id, + min_otu_size=min_otu_size, + step1_otu_map_fp=step1_otu_map_fp, + step1_failures_fasta_fp=step1_failures_fasta_fp, + parallel=parallel, + suppress_step4=suppress_step4, + logger=logger, + suppress_md5=suppress_md5, + suppress_index_page=True, + denovo_otu_picking_method=denovo_otu_picking_method, + reference_otu_picking_method=reference_otu_picking_method, + status_update_callback=status_update_callback, + minimum_failure_threshold=minimum_failure_threshold) + + # Perform post-iteration file shuffling whether the previous + # iteration's data previously existed or was just computed + + # step1 otu map and failures can only be used for the first + # iteration as subsequent iterations need to use updated refseqs + # files + step1_otu_map_fp = step1_failures_fasta_fp = None + new_refseqs_fp = join(iteration_output_dir, 'new_refseqs.fna') + refseqs_fp = new_refseqs_fp + + otu_table_fps.append( + join(iteration_output_dir, + 'otu_table_mc%d.biom' % min_otu_size)) + repset_fasta_fps.append( + join(iteration_output_dir, 'rep_set.fna')) + + # Remove the iteration input file to avoid FS usage + remove(iter_input_fp) + + iteration += 1 + seq_generators = next_iter_seq_gen + + # The open reference execution finished. Do some post processing to get + # the final results + otu_table_fp = join(output_dir, 'otu_table_mc%s.biom' % min_otu_size) + merge_cmd = ('merge_otu_tables.py -i %s -o %s' + % (','.join(otu_table_fps), otu_table_fp)) + commands.append([("Merge OTU tables", merge_cmd)]) + + # Build master rep set + final_repset_fp = join(output_dir, 'rep_set.fna') + final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp) + + command_handler(commands, status_update_callback, logger=logger, + close_logger_on_success=False) + commands = [] + + # Initialize output file names - these differ based on what combination of + # taxonomy assignment and alignment/tree building is happening + if run_assign_tax and run_align_and_tree: + tax_input_otu_table_fp = otu_table_fp + otu_table_w_tax_fp = join( + output_dir, 'otu_table_mc%d_w_tax.biom' % min_otu_size) + align_and_tree_input_otu_table = otu_table_w_tax_fp + pynast_failure_filtered_otu_table_fp = join( + output_dir, + 'otu_table_mc%d_w_tax_no_pynast_failures.biom' % min_otu_size) + elif run_assign_tax: + tax_input_otu_table_fp = otu_table_fp + otu_table_w_tax_fp = join(output_dir, + 'otu_table_mc%d_w_tax.biom' % min_otu_size) + elif run_align_and_tree: + align_and_tree_input_otu_table = otu_table_fp + pynast_failure_filtered_otu_table_fp = join( + output_dir, + 'otu_table_mc%d_no_pynast_failures.biom' % min_otu_size) + + if run_assign_tax: + # remove files from partially completed runs + remove_files([otu_table_w_tax_fp], error_on_missing=False) + + taxonomy_fp = assign_tax( + repset_fasta_fp=final_repset_fp, + output_dir=output_dir, + command_handler=command_handler, + params=params, + qiime_config=qiime_config, + parallel=parallel, + logger=logger, + status_update_callback=status_update_callback) + + # Add taxa to OTU table + add_metadata_cmd = ( + 'biom add-metadata -i %s --observation-metadata-fp %s -o %s ' + '--sc-separated taxonomy --observation-header OTUID,taxonomy' + % (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)) + commands.append([("Add taxa to OTU table", add_metadata_cmd)]) + + command_handler(commands, status_update_callback, logger=logger, + close_logger_on_success=False) + commands = [] + + if run_align_and_tree: + # remove files form partially completed runs + remove_files([pynast_failure_filtered_otu_table_fp], + error_on_missing=False) + + pynast_failures_fp = align_and_tree( + repset_fasta_fp=final_repset_fp, + output_dir=output_dir, + command_handler=command_handler, + params=params, + qiime_config=qiime_config, + parallel=parallel, + logger=logger, + status_update_callback=status_update_callback) + + # Build OTU table without pynast failures + table = load_table(align_and_tree_input_otu_table) + filtered_otu_table = filter_otus_from_otu_table( + table, + get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')), + 0, inf, 0, inf, negate_ids_to_keep=True) + write_biom_table(filtered_otu_table, + pynast_failure_filtered_otu_table_fp) + + command_handler(commands, status_update_callback, logger=logger, + close_logger_on_success=False) + commands = [] + + logger.close() diff --git a/scripts/pick_open_reference_otus.py b/scripts/pick_open_reference_otus.py index 049588b9f6..b83f592d2f 100755 --- a/scripts/pick_open_reference_otus.py +++ b/scripts/pick_open_reference_otus.py @@ -19,7 +19,8 @@ call_commands_serially, print_commands, no_status_updates, print_to_stdout) from qiime.workflow.pick_open_reference_otus import ( pick_subsampled_open_reference_otus, - iterative_pick_subsampled_open_reference_otus) + iterative_pick_subsampled_open_reference_otus, + convergent_pick_subsampled_open_reference_otus) qiime_config = load_qiime_config() options_lookup = get_options_lookup() @@ -215,6 +216,14 @@ "-i $PWD/seqs1.fna,$PWD/seqs2.fna -r $PWD/refseqs.fna -o $PWD/ucrss_iter/ " "-s 0.1 -p $PWD/ucrss_params.txt")) +script_info['script_usage'].append(("", "Run the subsampled open-reference " + "OTU picking workflow in convergent iterative mode on seqs1.fna and seqs2.fna using " + "refseqs.fna as the initial reference collection. ALWAYS SPECIFY ABSOLUTE " + "FILE PATHS (absolute path represented here as $PWD, but will generally " + "look something like /home/ubuntu/my_analysis/", "%prog " + "-i $PWD/seqs1.fna,$PWD/seqs2.fna -r $PWD/refseqs.fna -o $PWD/ucrss_iterc/ " + "-s 0.1 -p $PWD/ucrss_params.txt --convergent")) + script_info['script_usage'].append(("", "Run the subsampled open-reference " "OTU picking workflow in iterative mode on seqs1.fna and seqs2.fna using " "refseqs.fna as the initial reference collection. This is useful if " @@ -322,7 +331,16 @@ make_option('--suppress_align_and_tree', action='store_true', default=False, help='skip the sequence alignment and tree-building steps [default: ' - '%default]') + '%default]'), + make_option('--num_seqs', type='int', default=10000, + help='Number of sequences to include form each input file in ' + 'each step in the convergent mode. [default:%default]'), + make_option('--convergent', action='store_true', + default=False, + help='Run the open reference workflow in convergent iterative mode. ' + 'This flag is silently ignored when not running in iterative ' + 'mode. [default: %default]'), + ] script_info['version'] = __version__ @@ -431,23 +449,45 @@ def main(): status_update_callback=status_update_callback, minimum_failure_threshold=minimum_failure_threshold) else: - iterative_pick_subsampled_open_reference_otus(input_fps=input_fps, - refseqs_fp=refseqs_fp, output_dir=output_dir, - percent_subsample=percent_subsample, new_ref_set_id=new_ref_set_id, - command_handler=command_handler, params=params, - min_otu_size=opts.min_otu_size, - run_assign_tax=not opts.suppress_taxonomy_assignment, - run_align_and_tree=not opts.suppress_align_and_tree, - qiime_config=qiime_config, - prefilter_refseqs_fp=prefilter_refseqs_fp, - prefilter_percent_id=prefilter_percent_id, - step1_otu_map_fp=opts.step1_otu_map_fp, - step1_failures_fasta_fp=opts.step1_failures_fasta_fp, - parallel=parallel, suppress_step4=opts.suppress_step4, logger=None, - denovo_otu_picking_method=denovo_otu_picking_method, - reference_otu_picking_method=reference_otu_picking_method, - status_update_callback=status_update_callback, - minimum_failure_threshold=minimum_failure_threshold) + if opts.convergent: + convergent_pick_subsampled_open_reference_otus( + input_fps=input_fps, + refseqs_fp=refseqs_fp, output_dir=output_dir, + percent_subsample=percent_subsample, + new_ref_set_id=new_ref_set_id, + command_handler=command_handler, params=params, + min_otu_size=opts.min_otu_size, + run_assign_tax=not opts.suppress_taxonomy_assignment, + run_align_and_tree=not opts.suppress_align_and_tree, + qiime_config=qiime_config, + prefilter_refseqs_fp=prefilter_refseqs_fp, + prefilter_percent_id=prefilter_percent_id, + step1_otu_map_fp=opts.step1_otu_map_fp, + step1_failures_fasta_fp=opts.step1_failures_fasta_fp, + parallel=parallel, suppress_step4=opts.suppress_step4, logger=None, + denovo_otu_picking_method=denovo_otu_picking_method, + reference_otu_picking_method=reference_otu_picking_method, + status_update_callback=status_update_callback, + minimum_failure_threshold=minimum_failure_threshold, + num_seqs=opts.num_seqs) + else: + iterative_pick_subsampled_open_reference_otus(input_fps=input_fps, + refseqs_fp=refseqs_fp, output_dir=output_dir, + percent_subsample=percent_subsample, new_ref_set_id=new_ref_set_id, + command_handler=command_handler, params=params, + min_otu_size=opts.min_otu_size, + run_assign_tax=not opts.suppress_taxonomy_assignment, + run_align_and_tree=not opts.suppress_align_and_tree, + qiime_config=qiime_config, + prefilter_refseqs_fp=prefilter_refseqs_fp, + prefilter_percent_id=prefilter_percent_id, + step1_otu_map_fp=opts.step1_otu_map_fp, + step1_failures_fasta_fp=opts.step1_failures_fasta_fp, + parallel=parallel, suppress_step4=opts.suppress_step4, logger=None, + denovo_otu_picking_method=denovo_otu_picking_method, + reference_otu_picking_method=reference_otu_picking_method, + status_update_callback=status_update_callback, + minimum_failure_threshold=minimum_failure_threshold) if __name__ == "__main__": main() diff --git a/tests/test_workflow/test_pick_open_reference_otus.py b/tests/test_workflow/test_pick_open_reference_otus.py index 9ad4b4c24e..f692ab91a0 100755 --- a/tests/test_workflow/test_pick_open_reference_otus.py +++ b/tests/test_workflow/test_pick_open_reference_otus.py @@ -29,7 +29,8 @@ from qiime.workflow.pick_open_reference_otus import ( pick_subsampled_open_reference_otus, iterative_pick_subsampled_open_reference_otus, - final_repset_from_iteration_repsets) + final_repset_from_iteration_repsets, + convergent_pick_subsampled_open_reference_otus) from bfillings.sortmerna_v2 import build_database_sortmerna @@ -1966,5 +1967,315 @@ def test_final_repset_from_iteration_repsets(self): final_repset_from_iteration_repsets([repset1, repset2, repset3])) self.assertEqual(actual, exp) + def test_convergent_pick_subsampled_open_reference_otus_no_prefilter(self): + """pick_subsampled_open_reference_otus functions as expected without + prefilter in convergent mode + """ + convergent_pick_subsampled_open_reference_otus( + input_fps=[self.test_data['seqs'][0], + self.test_data['extra_seqs'][0]], + refseqs_fp=self.test_data['refseqs'][0], + output_dir=self.wf_out, + percent_subsample=0.5, + new_ref_set_id='wf.test.otu', + command_handler=call_commands_serially, + params=self.params, + qiime_config=self.qiime_config, + prefilter_refseqs_fp=None, + step1_otu_map_fp=None, + step1_failures_fasta_fp=None, + parallel=False, + suppress_step4=False, + logger=None, + status_update_callback=no_status_updates, + minimum_failure_threshold=0, + num_seqs=150) + + for i in (0, 1): + final_otu_map_fp = '%s/%d/final_otu_map.txt' % (self.wf_out, i) + final_failure_fp = '%s/%d/final_failures.txt' % (self.wf_out, i) + otu_table_fp = '%s/%d/otu_table_mc2.biom' % (self.wf_out, i) + repset_fp = '%s/%d/rep_set.fna' % (self.wf_out, i) + new_refseqs_fp = '%s/%d/new_refseqs.fna' % (self.wf_out, i) + + self.assertTrue( + exists(final_otu_map_fp), + "Final OTU map doesn't exist") + self.assertFalse( + exists(final_failure_fp), + "Final failures file shouldn't exist, but it does") + self.assertTrue(exists(otu_table_fp), "OTU table doesn't exist.") + self.assertTrue(exists(repset_fp), "Rep set doesn't exist.") + self.assertTrue( + exists(new_refseqs_fp), + "New refseqs file doesn't exist.") + + otu_table_fp = ('%s/otu_table_mc2_w_tax_no_pynast_failures.biom' + % self.wf_out) + tree_fp = '%s/rep_set.tre' % self.wf_out + aln_fp = '%s/pynast_aligned_seqs/rep_set_aligned.fasta' % self.wf_out + pynast_failures_fp = ('%s/pynast_aligned_seqs/rep_set_failures.fasta' + % self.wf_out) + iter0_otu_map_w_singletons = '%s/0/final_otu_map.txt' % self.wf_out + iter1_otu_map_w_singletons = '%s/1/final_otu_map.txt' % self.wf_out + + self.assertTrue(exists(otu_table_fp), "Final OTU table doesn't exist") + self.assertTrue(exists(tree_fp), "Final tree doesn't exist") + self.assertTrue(exists(aln_fp), "Final alignment doesn't exist") + self.assertTrue(exists(pynast_failures_fp), + "PyNAST failures file doesn't exist") + self.assertTrue(exists(iter0_otu_map_w_singletons), + "Iteration 0 OTU map with singletons doesn't exist") + self.assertTrue(exists(iter1_otu_map_w_singletons), + "Iteration 1 OTU map with singletons doesn't exist") + + # all OTUs in final OTU table occur more than once + otu_table = load_table(otu_table_fp) + for row in otu_table.iter_data(axis='observation'): + self.assertTrue(sum(row) >= 2, + "Singleton OTU detected in OTU table.") + # number of OTUs in final OTU table equals the number of seequences in + # the alignment... + self.assertEqual(len(otu_table.ids(axis='observation')), + count_seqs(aln_fp)[0]) + # ... and that number is 7 (note: this is the same as without the + # prefilter because these reads are getting filtered from the final + # otu table because they fail to align with PyNAST) + self.assertEqual(len(otu_table.ids(axis='observation')), 7) + + # sequences that are ordinarily prefiltered are in the OTU map + otu_map_seq_ids = [] + for l in open(iter0_otu_map_w_singletons, 'U'): + otu_map_seq_ids.extend(l.strip().split()[1:]) + self.assertTrue('t1_1' in otu_map_seq_ids) + self.assertTrue('p1_2' in otu_map_seq_ids) + + otu_map_seq_ids = [] + for l in open(iter1_otu_map_w_singletons, 'U'): + otu_map_seq_ids.extend(l.strip().split()[1:]) + self.assertTrue('not16S.1_130' in otu_map_seq_ids) + self.assertTrue('not16S.1_151' in otu_map_seq_ids) + + # confirm that the new reference sequences is the same length as the + # input reference sequences plus the number of new non-singleton otus + self.assertEqual(count_seqs(new_refseqs_fp)[0], + count_seqs(self.test_data['refseqs'][0])[0] + + len([o for o in otu_table.ids(axis='observation') + if o.startswith('wf.test.otu')]) + + count_seqs(pynast_failures_fp)[0]) + + # spot check a few of the otus to confirm that we're getting reference + # and new otus in the final otu map. This is done on the OTU map + # singletons get filtered before building the otu table + otu_map = fields_to_dict(open(iter0_otu_map_w_singletons)) + self.assertTrue('295053' in otu_map, + "Reference OTU (295053) is not in the final OTU map.") + self.assertTrue('42684' in otu_map, + "Failure OTU (42684) is not in the final OTU map.") + # OTU from first iteration is in final map + self.assertTrue('wf.test.otu.Iter.0.ReferenceOTU0' in otu_map, + "Failure OTU (wf.test.otu.Iter.0.ReferenceOTU0) is not" + " in the final OTU map.") + # OTU from second iteration is in final map + otu_map = fields_to_dict(open(iter1_otu_map_w_singletons)) + self.assertTrue('wf.test.otu.Iter.1.ReferenceOTU0' in otu_map, + "Failure OTU (wf.test.otu.Iter.1.ReferenceOTU0) is not" + " in the final OTU map.") + + # in the test dataset, the OTUs from the second iteration are filtered + # out due to pynast failures. Thus, we will test that OTUs from each + # iteration are in the unfiltered table + unf_table = load_table('%s/otu_table_mc2.biom' % self.wf_out) + self.assertTrue('295053' in unf_table.ids(axis='observation')) + self.assertTrue( + 'wf.test.otu.Iter.1.ReferenceOTU0' in + unf_table.ids(axis='observation')) + + # confirm that number of tips in the tree is the same as the number of + # sequences in the alignment + with open(tree_fp) as f: + num_tree_tips = len(list(TreeNode.from_newick(f).tips())) + num_align_seqs = count_seqs(aln_fp)[0] + self.assertEqual(num_tree_tips, num_align_seqs) + self.assertEqual(num_tree_tips, 7) + + # OTU table without singletons or pynast failures has same number of + # otus as there are aligned sequences + self.assertEqual(len(otu_table.ids(axis='observation')), + num_align_seqs) + + # Reference OTUs have correct taxonomy assignment + obs_idx = otu_table.index('295053', axis='observation') + self.assertEqual( + otu_table.metadata(axis='observation')[obs_idx]['taxonomy'], + ["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", + "o__Enterobacteriales", "f__Enterobacteriaceae", "g__", "s__"]) + # All observations have 'taxonomy' metadata, and are at least assigned + # to 'bacteria' + for o in otu_table.iter(axis='observation'): + self.assertTrue( + o[2]['taxonomy'][0] in ['k__Bacteria', 'Unassigned']) + + def test_convergent_pick_subsampled_open_reference_otus(self): + """pick_subsampled_open_reference_otus functions as expected with + prefilter in convergent mode + """ + convergent_pick_subsampled_open_reference_otus( + input_fps=[self.test_data['seqs'][0], + self.test_data['extra_seqs'][0]], + refseqs_fp=self.test_data['refseqs'][0], + output_dir=self.wf_out, + percent_subsample=0.5, + new_ref_set_id='wf.test.otu', + command_handler=call_commands_serially, + params=self.params, + qiime_config=self.qiime_config, + prefilter_refseqs_fp=None, + prefilter_percent_id=0.60, + step1_otu_map_fp=None, + step1_failures_fasta_fp=None, + parallel=False, + suppress_step4=False, + logger=None, + status_update_callback=no_status_updates, + minimum_failure_threshold=0, + num_seqs=150) + + for i in (0, 1): + final_otu_map_fp = '%s/%d/final_otu_map.txt' % (self.wf_out, i) + final_failure_fp = '%s/%d/final_failures.txt' % (self.wf_out, i) + otu_table_fp = '%s/%d/otu_table_mc2.biom' % (self.wf_out, i) + repset_fp = '%s/%d/rep_set.fna' % (self.wf_out, i) + new_refseqs_fp = '%s/%d/new_refseqs.fna' % (self.wf_out, i) + + self.assertTrue( + exists(final_otu_map_fp), + "Final OTU map doesn't exist") + self.assertFalse( + exists(final_failure_fp), + "Final failures file shouldn't exist, but it does") + self.assertTrue(exists(otu_table_fp), "OTU table doesn't exist.") + self.assertTrue(exists(repset_fp), "Rep set doesn't exist.") + self.assertTrue( + exists(new_refseqs_fp), + "New refseqs file doesn't exist.") + + otu_table_fp = ('%s/otu_table_mc2_w_tax_no_pynast_failures.biom' + % self.wf_out) + tree_fp = '%s/rep_set.tre' % self.wf_out + aln_fp = '%s/pynast_aligned_seqs/rep_set_aligned.fasta' % self.wf_out + pynast_failures_fp = ('%s/pynast_aligned_seqs/rep_set_failures.fasta' + % self.wf_out) + iter0_otu_map_w_singletons = '%s/0/final_otu_map.txt' % self.wf_out + iter1_otu_map_w_singletons = '%s/1/final_otu_map.txt' % self.wf_out + + self.assertTrue(exists(otu_table_fp), "Final OTU table doesn't exist") + self.assertTrue(exists(tree_fp), "Final tree doesn't exist") + self.assertTrue(exists(aln_fp), "Final alignment doesn't exist") + self.assertTrue(exists(pynast_failures_fp), + "PyNAST failurs file doesn't exist") + self.assertTrue(exists(iter0_otu_map_w_singletons), + "Iteration 0 OTU map with singletons doesn't exist") + self.assertTrue(exists(iter1_otu_map_w_singletons), + "Iteration 1 OTU map with singletons doesn't exist") + + # all OTUs in final OTU table occur more than once + otu_table = load_table(otu_table_fp) + for row in otu_table.iter_data(axis='observation'): + self.assertTrue(sum(row) >= 2, + "Singleton OTU detected in OTU table.") + # number of OTUs in final OTU table equals the number of seequences in + # the alignment... + self.assertEqual(len(otu_table.ids(axis='observation')), + count_seqs(aln_fp)[0]) + # ... and that number is 7 (note: this is the same as without the + # prefilter because these reads are getting filtered from the final + # otu table because they fail to align with PyNAST) + self.assertEqual(len(otu_table.ids(axis='observation')), 7) + + # non-16S sequences are prefiltered, so not in the OTU map + otu_map_seq_ids = [] + for l in open(iter0_otu_map_w_singletons, 'U'): + otu_map_seq_ids.extend(l.strip().split()[1:]) + self.assertFalse('t1_1' in otu_map_seq_ids) + self.assertFalse('p1_2' in otu_map_seq_ids) + self.assertFalse('not16S.1_130' in otu_map_seq_ids) + self.assertFalse('not16S.1_151' in otu_map_seq_ids) + + otu_map_seq_ids = [] + for l in open(iter1_otu_map_w_singletons, 'U'): + otu_map_seq_ids.extend(l.strip().split()[1:]) + self.assertFalse('t1_1' in otu_map_seq_ids) + self.assertFalse('p1_2' in otu_map_seq_ids) + self.assertFalse('not16S.1_130' in otu_map_seq_ids) + self.assertFalse('not16S.1_151' in otu_map_seq_ids) + + # confirm that the new reference sequences is the same length as the + # input reference sequences plus the number of new non-singleton otus + self.assertEqual(count_seqs(new_refseqs_fp)[0], + count_seqs(self.test_data['refseqs'][0])[0] + + len([o for o in otu_table.ids(axis='observation') + if o.startswith('wf.test.otu')]) + + count_seqs(pynast_failures_fp)[0]) + + # spot check a few of the otus to confirm that we're getting reference + # and new otus in the final otu map. This is done on the OTU map + # singletons get filtered before building the otu table + otu_map = fields_to_dict(open(iter0_otu_map_w_singletons)) + self.assertTrue('295053' in otu_map, + "Reference OTU (295053) is not in the final OTU map.") + self.assertTrue('42684' in otu_map, + "Failure OTU (42684) is not in the final OTU map.") + # OTU from first iteration is in final map + self.assertTrue('wf.test.otu.Iter.0.ReferenceOTU0' in otu_map, + "Failure OTU (wf.test.otu.Iter.0.ReferenceOTU0) is not" + " in the final OTU map (first iteration).") + # OTU from second iteration is in final map + # Due to prefiltering, only the reference OTU map is kept + otu_map = fields_to_dict(open(iter1_otu_map_w_singletons)) + self.assertTrue('295053' in otu_map, + "Reference OTU (295053) is not in the final OTU map " + "(second iteration).") + + # In the test dataset, the only OTU present in the second OTU table is + # a reference OTU that is also present in the first iteration. To check + # that the results form the second iteration are also present in the + # final OTU table we are going to get the counts of that OTU + iter0_otu_table_fp = '%s/%d/otu_table_mc2.biom' % (self.wf_out, 0) + iter1_otu_table_fp = '%s/%d/otu_table_mc2.biom' % (self.wf_out, 1) + + i0_counts = load_table(iter0_otu_table_fp).data( + '295053', axis='observation').sum() + i1_counts = load_table(iter1_otu_table_fp).data( + '295053', axis='observation').sum() + final_counts = load_table('%s/otu_table_mc2.biom' % self.wf_out).data( + '295053', axis='observation').sum() + self.assertEqual(final_counts, i0_counts + i1_counts) + + # confirm that number of tips in the tree is the same as the number of + # sequences in the alignment + with open(tree_fp) as f: + num_tree_tips = len(list(TreeNode.from_newick(f).tips())) + num_align_seqs = count_seqs(aln_fp)[0] + self.assertEqual(num_tree_tips, num_align_seqs) + self.assertEqual(num_tree_tips, 7) + + # OTU table without singletons or pynast failures has same number of + # otus as there are aligned sequences + self.assertEqual(len(otu_table.ids(axis='observation')), + num_align_seqs) + + # Reference OTUs have correct taxonomy assignment + obs_idx = otu_table.index('295053', axis='observation') + self.assertEqual( + otu_table.metadata(axis='observation')[obs_idx]['taxonomy'], + ["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", + "o__Enterobacteriales", "f__Enterobacteriaceae", "g__", "s__"]) + # All observations have 'taxonomy' metadata, and are at least assigned + # to 'bacteria' + for o in otu_table.iter(axis='observation'): + self.assertTrue( + o[2]['taxonomy'][0] in ['k__Bacteria', 'Unassigned']) + if __name__ == "__main__": main()