summaryrefslogtreecommitdiffstats
path: root/test/entropy/GenomePairwiseDist.m (plain)
blob: 544db0a7e849ba5fa25d8184b817b16d12ffadb0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME)
%   GenomePairwiseDist
%       This will align all of the protein segments in the genome structure
%       and return a cell-array.  Each cell is the alignment of one
%       protein.  If the protein is missing then the distance is NaN.
%

if nargin < 1
	ALIGN_NAME = 'saved_alignments';
end

try
    display('Loading Previous Alignments')
    d = load(ALIGN_NAME);
    saved_dists = d.DIST_CELL;
    saved_headers = d.header_names;
catch %#ok<CTCH>
    display('No previous alignments found')
    saved_headers = {};
    saved_dists = zeros(numel(IN_PROTS));
end

NUM_LETTERS = 4000000;


[header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header);

display('Starting Alignments')

dist_mat = NaN(size(IN_PROTS,1));

[tf loc]=ismember(header_names, saved_headers);

dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf));
[I J] = find(triu(isnan(dist_mat)));
inds = sub2ind(size(dist_mat), I, J);

sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS);

%find all of the ones that are not worth aligning
bad_mask = sizes(I) <= 1 | sizes(J) <= 1;
dist_mat(inds(bad_mask)) = Inf;
I(bad_mask) = [];
J(bad_mask) = [];

p_sizes = sizes(I)+sizes(J);
counter = 1;
display('Aligning...')
tic
while counter < length(I)
    cum_sizes = cumsum(p_sizes(counter+1:end));
    top = counter+find(cum_sizes<NUM_LETTERS,1, 'last');

    I_spots = I(counter:top);
    J_spots = J(counter:top);
    need_align = cell(length(I_spots),2);
    [need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence);
    [need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence);
    dist = zeros(size(I_spots));

	%parfor k = 1:size(need_align,1)
    for k = 1:size(need_align,1)
        %display([num2str(k) ' ' num2str(toc)])
        try
            dist(k)=nwalign_mod(need_align{k,:});
        catch %#ok<CTCH>
            dist(k) = Inf;
        end
    end
    time = toc;
    inds = sub2ind(size(dist_mat), I_spots,J_spots);
    dist_mat(inds) = dist;
    counter = top+1;
    display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds'])
end

DIST_CELL = dist_mat;
save(ALIGN_NAME, 'DIST_CELL', 'header_names')
DIST_OUT = dfun(DIST_CELL);





function out_d = dfun(MAT)

score12 = triu(MAT,1);
self_d = diag(MAT);
d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)'));

out_d = triu(d,1) + triu(d,1)';







Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.