-
Notifications
You must be signed in to change notification settings - Fork 257
ENH: avoid full dense matrix for parallel beta #2040
base: master
Are you sure you want to change the base?
Changes from 4 commits
23369b2
590c6ae
584af9b
636891f
b959ecb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,7 +34,7 @@ | |
| import warnings | ||
| warnings.filterwarnings('ignore', 'Not using MPI as mpi4py not found') | ||
|
|
||
| from numpy import asarray | ||
| from numpy import vstack | ||
| import cogent.maths.distance_transform as distance_transform | ||
| from biom import load_table | ||
|
|
||
|
|
@@ -144,8 +144,6 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, | |
|
|
||
| otu_table = load_table(input_path) | ||
|
|
||
| otumtx = asarray([v for v in otu_table.iter_data(axis='sample')]) | ||
|
|
||
| if tree_path: | ||
| tree = parse_newick(open(tree_path, 'U'), | ||
| PhyloNode) | ||
|
|
@@ -173,6 +171,7 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, | |
| % (metric, ', '.join(list_known_metrics()))) | ||
| exit(1) | ||
| if rowids is None: | ||
| otumtx = otu_table.matrix_data.T.toarray() | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This probably can just use the row by row stuff done below. Fairly certain there is never a need to do a dense conversion here. |
||
| # standard, full way | ||
| if is_phylogenetic: | ||
| dissims = metric_f(otumtx, otu_table.ids(axis='observation'), | ||
|
|
@@ -198,32 +197,28 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, | |
| metric_f.__name__ == 'binary_dist_chisq': | ||
| warnings.warn('dissimilarity ' + metric_f.__name__ + | ||
| ' is not parallelized, calculating the whole matrix...') | ||
| otumtx = otu_table.matrix_data.T.toarray() | ||
| row_dissims.append(metric_f(otumtx)[rowidx]) | ||
| else: | ||
| try: | ||
| row_metric = get_phylogenetic_row_metric(metric) | ||
| except AttributeError: | ||
| # do element by element | ||
| dissims = [] | ||
| sample_ids = otu_table.ids() | ||
| observation_ids = otu_table.ids(axis='observation') | ||
| for i in range(len(sample_ids)): | ||
| if is_phylogenetic: | ||
| dissim = metric_f( | ||
| otumtx[[rowidx, i], :], observation_ids, | ||
| tree, [sample_ids[rowidx], sample_ids[i]], | ||
| make_subtree=(not full_tree))[0, 1] | ||
| else: | ||
| dissim = metric_f(otumtx[[rowidx, i], :])[0, 1] | ||
| dissims.append(dissim) | ||
| row_dissims.append(dissims) | ||
| else: | ||
| # do whole row at once | ||
| dissims = row_metric(otumtx, | ||
| otu_table.ids(axis='observation'), | ||
| tree, otu_table.ids(), rowid, | ||
| make_subtree=(not full_tree)) | ||
| row_dissims.append(dissims) | ||
| # do element by element | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ElDeveloper, wanna give this another spin? I verified with @gregcaporaso, I'm really not sure if this change is safe for all the metrics, but for some reason,
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @wasade, I think this might be it thanks for having a look, I started it On (Jul-23-15|11:42), Daniel McDonald wrote:
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💥 |
||
| dissims = [] | ||
| sample_ids = otu_table.ids() | ||
| observation_ids = otu_table.ids(axis='observation') | ||
| for i in range(len(sample_ids)): | ||
| samp_a = otu_table.data(sample_ids[rowidx]) | ||
| samp_b = otu_table.data(sample_ids[i]) | ||
| samp_data = vstack([samp_a, samp_b]) | ||
|
|
||
| if is_phylogenetic: | ||
|
|
||
| dissim = metric_f( | ||
| samp_data, observation_ids, | ||
| tree, [sample_ids[rowidx], sample_ids[i]], | ||
| make_subtree=(not full_tree))[0, 1] | ||
| else: | ||
| dissim = metric_f(samp_data)[0, 1] | ||
| dissims.append(dissim) | ||
| row_dissims.append(dissims) | ||
|
|
||
| with open(outfilepath, 'w') as f: | ||
| f.write(format_matrix(row_dissims, rowids_list, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would probably
import numpy as np, but this is also ok.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no strong feelings here, can do if you'd like but the iteration with jenkins is large
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
True, no strong feelings either ... Just ignore that comment.
Yoshiki Vázquez-Baeza