Skip to content
This repository was archived by the owner on Nov 9, 2023. It is now read-only.
Open
236 changes: 236 additions & 0 deletions qiime/workflow/pick_open_reference_otus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"

from itertools import izip
from os import remove
from os.path import split, splitext, getsize, exists, abspath, join
from shutil import copyfile, rmtree
from numpy import inf
Expand Down Expand Up @@ -1091,3 +1093,237 @@ def pick_subsampled_open_reference_otus(input_fp,
if not suppress_index_page:
index_fp = '%s/index.html' % output_dir
generate_index_page(index_links, index_fp)


def convergent_pick_subsampled_open_reference_otus(
input_fps,
refseqs_fp,
output_dir,
percent_subsample,
new_ref_set_id,
command_handler,
params,
qiime_config,
prefilter_refseqs_fp=None,
prefilter_percent_id=None,
min_otu_size=2,
run_assign_tax=True,
run_align_and_tree=True,
step1_otu_map_fp=None,
step1_failures_fasta_fp=None,
parallel=False,
suppress_step4=False,
logger=None,
suppress_md5=False,
denovo_otu_picking_method='uclust',
reference_otu_picking_method='uclust_ref',
status_update_callback=print_to_stdout,
minimum_failure_threshold=100000,
num_seqs=10000):
"""Call the convergent open reference workflow"""
create_dir(output_dir)
commands = []

if logger is None:
logger = WorkflowLogger(generate_log_fp(output_dir), params=params,
qiime_config=qiime_config)
close_logger_on_success = True
else:
close_logger_on_success = False

# if the user has not passed a different reference collection for the
# pre-filter, used the input refseqs_fp for all iterations. we want to
# pre-filter all data against the input data as lower percent identity
# searches with uclust can be slow, so we want the reference collection to
# stay at a reasonable size.
if prefilter_refseqs_fp is None:
prefilter_refseqs_fp = refseqs_fp

otu_table_fps = []
repset_fasta_fps = []

# Construct an array of sequence generators
seq_generators = [parse_fasta(open(fp, 'U')) for fp in input_fps]

# The input files can have different sizes, making the workflow to run
# slower as less sequences are included in each iteration as the smaller
# files are being processed. In order to keep the number of sequences per
# iteration, we will be adjusting the number of seqs per file dynamically
total_num_seqs = len(seq_generators) * num_seqs

iteration = 0
# Iterate until all seqs are clustered
while len(seq_generators) > 0:
# Adjust the number of sequences per file
num_seqs = int(total_num_seqs / len(seq_generators))

iteration_output_dir = '%s/%d/' % (output_dir, iteration)
iter_input_fp = join(output_dir, "iter_%d_seqs.fna" % iteration)

next_iter_seq_gen = []

# Write the sequences for this iteration
with open(iter_input_fp, 'w') as f:

# Loop through all the input sequences
for seq_gen in seq_generators:

# We need to write 'num_seqs' from each input
count_added = 0
for i, e in izip(range(num_seqs), seq_gen):
f.write(">%s\n%s\n" % e)
count_added += 1

# More sequences still in this generator
if count_added == num_seqs:
next_iter_seq_gen.append(seq_gen)

if iteration_output_exists(iteration_output_dir, min_otu_size):
# If the output from an iteration already exists, skip that
# iteration (useful for continuing failed runs)
log_input_md5s(logger, [iter_input_fp, refseqs_fp])
logger.write(
'Iteration %s (input file: %s) output data already exists. '
'Skipping and moving to next.\n\n'
% (iteration, iter_input_fp))
else:
# At this point we have already created the new input file with
# <= num_seqs * len(seq_generator) sequences for this iteration,
# so we can run pick_subsampled_open_reference_otus
pick_subsampled_open_reference_otus(
input_fp=iter_input_fp,
refseqs_fp=refseqs_fp,
output_dir=iteration_output_dir,
percent_subsample=percent_subsample,
new_ref_set_id='%s.Iter.%d' % (new_ref_set_id, iteration),
command_handler=command_handler,
params=params,
qiime_config=qiime_config,
run_assign_tax=False,
run_align_and_tree=False,
prefilter_refseqs_fp=prefilter_refseqs_fp,
prefilter_percent_id=prefilter_percent_id,
min_otu_size=min_otu_size,
step1_otu_map_fp=step1_otu_map_fp,
step1_failures_fasta_fp=step1_failures_fasta_fp,
parallel=parallel,
suppress_step4=suppress_step4,
logger=logger,
suppress_md5=suppress_md5,
suppress_index_page=True,
denovo_otu_picking_method=denovo_otu_picking_method,
reference_otu_picking_method=reference_otu_picking_method,
status_update_callback=status_update_callback,
minimum_failure_threshold=minimum_failure_threshold)

# Perform post-iteration file shuffling whether the previous
# iteration's data previously existed or was just computed

# step1 otu map and failures can only be used for the first
# iteration as subsequent iterations need to use updated refseqs
# files
step1_otu_map_fp = step1_failures_fasta_fp = None
new_refseqs_fp = join(iteration_output_dir, 'new_refseqs.fna')
refseqs_fp = new_refseqs_fp

otu_table_fps.append(
join(iteration_output_dir,
'otu_table_mc%d.biom' % min_otu_size))
repset_fasta_fps.append(
join(iteration_output_dir, 'rep_set.fna'))

# Remove the iteration input file to avoid FS usage
remove(iter_input_fp)

iteration += 1
seq_generators = next_iter_seq_gen

# The open reference execution finished. Do some post processing to get
# the final results
otu_table_fp = join(output_dir, 'otu_table_mc%s.biom' % min_otu_size)
merge_cmd = ('merge_otu_tables.py -i %s -o %s'
% (','.join(otu_table_fps), otu_table_fp))
commands.append([("Merge OTU tables", merge_cmd)])

# Build master rep set
final_repset_fp = join(output_dir, 'rep_set.fna')
final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

command_handler(commands, status_update_callback, logger=logger,
close_logger_on_success=False)
commands = []

# Initialize output file names - these differ based on what combination of
# taxonomy assignment and alignment/tree building is happening
if run_assign_tax and run_align_and_tree:
tax_input_otu_table_fp = otu_table_fp
otu_table_w_tax_fp = join(
output_dir, 'otu_table_mc%d_w_tax.biom' % min_otu_size)
align_and_tree_input_otu_table = otu_table_w_tax_fp
pynast_failure_filtered_otu_table_fp = join(
output_dir,
'otu_table_mc%d_w_tax_no_pynast_failures.biom' % min_otu_size)
elif run_assign_tax:
tax_input_otu_table_fp = otu_table_fp
otu_table_w_tax_fp = join(output_dir,
'otu_table_mc%d_w_tax.biom' % min_otu_size)
elif run_align_and_tree:
align_and_tree_input_otu_table = otu_table_fp
pynast_failure_filtered_otu_table_fp = join(
output_dir,
'otu_table_mc%d_no_pynast_failures.biom' % min_otu_size)

if run_assign_tax:
# remove files from partially completed runs
remove_files([otu_table_w_tax_fp], error_on_missing=False)

taxonomy_fp = assign_tax(
repset_fasta_fp=final_repset_fp,
output_dir=output_dir,
command_handler=command_handler,
params=params,
qiime_config=qiime_config,
parallel=parallel,
logger=logger,
status_update_callback=status_update_callback)

# Add taxa to OTU table
add_metadata_cmd = (
'biom add-metadata -i %s --observation-metadata-fp %s -o %s '
'--sc-separated taxonomy --observation-header OTUID,taxonomy'
% (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp))
commands.append([("Add taxa to OTU table", add_metadata_cmd)])

command_handler(commands, status_update_callback, logger=logger,
close_logger_on_success=False)
commands = []

if run_align_and_tree:
# remove files form partially completed runs
remove_files([pynast_failure_filtered_otu_table_fp],
error_on_missing=False)

pynast_failures_fp = align_and_tree(
repset_fasta_fp=final_repset_fp,
output_dir=output_dir,
command_handler=command_handler,
params=params,
qiime_config=qiime_config,
parallel=parallel,
logger=logger,
status_update_callback=status_update_callback)

# Build OTU table without pynast failures
table = load_table(align_and_tree_input_otu_table)
filtered_otu_table = filter_otus_from_otu_table(
table,
get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
0, inf, 0, inf, negate_ids_to_keep=True)
write_biom_table(filtered_otu_table,
pynast_failure_filtered_otu_table_fp)

command_handler(commands, status_update_callback, logger=logger,
close_logger_on_success=False)
commands = []

logger.close()
78 changes: 59 additions & 19 deletions scripts/pick_open_reference_otus.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
call_commands_serially, print_commands, no_status_updates, print_to_stdout)
from qiime.workflow.pick_open_reference_otus import (
pick_subsampled_open_reference_otus,
iterative_pick_subsampled_open_reference_otus)
iterative_pick_subsampled_open_reference_otus,
convergent_pick_subsampled_open_reference_otus)

qiime_config = load_qiime_config()
options_lookup = get_options_lookup()
Expand Down Expand Up @@ -215,6 +216,14 @@
"-i $PWD/seqs1.fna,$PWD/seqs2.fna -r $PWD/refseqs.fna -o $PWD/ucrss_iter/ "
"-s 0.1 -p $PWD/ucrss_params.txt"))

script_info['script_usage'].append(("", "Run the subsampled open-reference "
"OTU picking workflow in convergent iterative mode on seqs1.fna and seqs2.fna using "
"refseqs.fna as the initial reference collection. ALWAYS SPECIFY ABSOLUTE "
"FILE PATHS (absolute path represented here as $PWD, but will generally "
"look something like /home/ubuntu/my_analysis/", "%prog "
"-i $PWD/seqs1.fna,$PWD/seqs2.fna -r $PWD/refseqs.fna -o $PWD/ucrss_iterc/ "
"-s 0.1 -p $PWD/ucrss_params.txt --convergent"))

script_info['script_usage'].append(("", "Run the subsampled open-reference "
"OTU picking workflow in iterative mode on seqs1.fna and seqs2.fna using "
"refseqs.fna as the initial reference collection. This is useful if "
Expand Down Expand Up @@ -322,7 +331,16 @@
make_option('--suppress_align_and_tree', action='store_true',
default=False,
help='skip the sequence alignment and tree-building steps [default: '
'%default]')
'%default]'),
make_option('--num_seqs', type='int', default=10000,
help='Number of sequences to include form each input file in '
'each step in the convergent mode. [default:%default]'),
make_option('--convergent', action='store_true',
default=False,
help='Run the open reference workflow in convergent iterative mode. '
'This flag is silently ignored when not running in iterative '
'mode. [default: %default]'),

]
script_info['version'] = __version__

Expand Down Expand Up @@ -431,23 +449,45 @@ def main():
status_update_callback=status_update_callback,
minimum_failure_threshold=minimum_failure_threshold)
else:
iterative_pick_subsampled_open_reference_otus(input_fps=input_fps,
refseqs_fp=refseqs_fp, output_dir=output_dir,
percent_subsample=percent_subsample, new_ref_set_id=new_ref_set_id,
command_handler=command_handler, params=params,
min_otu_size=opts.min_otu_size,
run_assign_tax=not opts.suppress_taxonomy_assignment,
run_align_and_tree=not opts.suppress_align_and_tree,
qiime_config=qiime_config,
prefilter_refseqs_fp=prefilter_refseqs_fp,
prefilter_percent_id=prefilter_percent_id,
step1_otu_map_fp=opts.step1_otu_map_fp,
step1_failures_fasta_fp=opts.step1_failures_fasta_fp,
parallel=parallel, suppress_step4=opts.suppress_step4, logger=None,
denovo_otu_picking_method=denovo_otu_picking_method,
reference_otu_picking_method=reference_otu_picking_method,
status_update_callback=status_update_callback,
minimum_failure_threshold=minimum_failure_threshold)
if opts.convergent:
convergent_pick_subsampled_open_reference_otus(
input_fps=input_fps,
refseqs_fp=refseqs_fp, output_dir=output_dir,
percent_subsample=percent_subsample,
new_ref_set_id=new_ref_set_id,
command_handler=command_handler, params=params,
min_otu_size=opts.min_otu_size,
run_assign_tax=not opts.suppress_taxonomy_assignment,
run_align_and_tree=not opts.suppress_align_and_tree,
qiime_config=qiime_config,
prefilter_refseqs_fp=prefilter_refseqs_fp,
prefilter_percent_id=prefilter_percent_id,
step1_otu_map_fp=opts.step1_otu_map_fp,
step1_failures_fasta_fp=opts.step1_failures_fasta_fp,
parallel=parallel, suppress_step4=opts.suppress_step4, logger=None,
denovo_otu_picking_method=denovo_otu_picking_method,
reference_otu_picking_method=reference_otu_picking_method,
status_update_callback=status_update_callback,
minimum_failure_threshold=minimum_failure_threshold,
num_seqs=opts.num_seqs)
else:
iterative_pick_subsampled_open_reference_otus(input_fps=input_fps,
refseqs_fp=refseqs_fp, output_dir=output_dir,
percent_subsample=percent_subsample, new_ref_set_id=new_ref_set_id,
command_handler=command_handler, params=params,
min_otu_size=opts.min_otu_size,
run_assign_tax=not opts.suppress_taxonomy_assignment,
run_align_and_tree=not opts.suppress_align_and_tree,
qiime_config=qiime_config,
prefilter_refseqs_fp=prefilter_refseqs_fp,
prefilter_percent_id=prefilter_percent_id,
step1_otu_map_fp=opts.step1_otu_map_fp,
step1_failures_fasta_fp=opts.step1_failures_fasta_fp,
parallel=parallel, suppress_step4=opts.suppress_step4, logger=None,
denovo_otu_picking_method=denovo_otu_picking_method,
reference_otu_picking_method=reference_otu_picking_method,
status_update_callback=status_update_callback,
minimum_failure_threshold=minimum_failure_threshold)

if __name__ == "__main__":
main()
Loading