summaryrefslogtreecommitdiffstats
Unidiff
-rw-r--r--test/entropy/Align2Ref.m24
-rw-r--r--test/entropy/CalculateEntropy.m15
-rw-r--r--test/entropy/CalculateProteinEntropy.m25
-rw-r--r--test/entropy/FastNWalign2.c94
-rw-r--r--test/entropy/GenomeAlignments.m31
-rw-r--r--test/entropy/GenomePairwiseDist.m98
-rw-r--r--test/entropy/RefineAlignments.m276
-rw-r--r--test/entropy/don_anal.m40
-rw-r--r--test/entropy/nwalign_mod.m637
9 files changed, 1240 insertions, 0 deletions
diff --git a/test/entropy/RefineAlignments.m b/test/entropy/RefineAlignments.m
new file mode 100644
index 0000000..0e69275
--- a/dev/null
+++ b/test/entropy/RefineAlignments.m
@@ -0,0 +1,276 @@
1function [OUT_ALIGN cur_score] = RefineAlignments(IN_ALIGN, varargin)
2% RefineAlignments
3% Uses an interative algorithm to improve the multiple alignments of
4% an input sequence. It will iteratively remove a fraction of the
5% sequences, compress any gaps, and then re-align. This will be done
6% for NUM_REPS.
7%
8% Written by Will Dampier. Contact at wnd22@drexel.edu
9% Version 1.0 on 10/23/09
10%
11% [OUT_ALIGN SCORE] = RefineAlignments(IN_ALIGN)
12%
13% IN_ALIGN A char-array representing a multiple alignment to be
14% refined.
15%
16% OUT_ALIGN The refined alignment.
17% SCORE The new alignment score.
18%
19%
20% Optional Arguements:
21%
22% 'Alphabet' Either 'AA' or 'NT' to indicate whether this is a
23% nucleotide or amino acid alignment.
24%
25% 'GapPenalty' The penalty for gaps. This is set at -8 by
26% default.
27%
28% 'DistMat' The distance matrix to use for the alignment
29% scoring. Default is BLOSUM50 for 'AA' and NUC44
30% for 'NT'
31%
32% 'NumReps' The number of refining iterations to do.
33%
34% 'NumTry' The number of tries to perform at each iteration.
35%
36% 'RefineType' Which type of refinement to perform. This can
37% be either 'STOCASTIC', 'DETERMINISTIC' or 'MIXED'.
38%
39% 'Display' A boolean indicating whether to display the output
40% at each iteration.
41%
42%
43
44
45
46
47RM_FRAC = 0.1;
48NUM_TRIES = 10;
49NUM_REPS = 100;
50alpha = 'aa';
51dist_mat = [];
52gap_penalty = -8;
53TYPE = 'mixed';
54disp_flag = true;
55
56for i = 1:2:length(varargin)
57 switch lower(varargin{i})
58
59 case 'alphabet'
60 if strcmpi('nt', varargin{i+1}) || strcmpi('aa', varargin{i+1})
61 alpha = varargin{i+1};
62 else
63 error('RefineAlignments:BADALPHA', ...
64 'Arguement to "alphabet" must be "NT" or "AA"')
65 end
66 case 'gappenalty'
67 if isnumeric(varargin{i+1}) && varargin{i+1} < 0
68 gap_penalty = varargin{i+1};
69 else
70 error('RefineAlignments:BADGAP', ...
71 'Arguement to "GapPenalty" must be negative numeric')
72 end
73 case 'distmat'
74 dist_mat = varargin{i+1};
75
76 case 'numreps'
77 if isnumeric(varargin{i+1}) && varargin{i+1} > 1
78 NUM_REPS = varargin{i+1};
79 else
80 error('RefineAlignments:BADREPS', ...
81 'Arguement to NumReps must be a positive numeric')
82 end
83
84 case 'refinetype'
85 TYPE = varargin{i+1};
86
87 case 'display'
88 if islogical(varargin{i+1})
89 disp_flag = varargin{i+1};
90 else
91 error('RefineAlignments:DISPLAY', ...
92 'Arguement to Display must be a boolean.')
93 end
94
95 otherwise
96 error('RefineAlignments:BADARG', 'Unknown arguement: %s', ...
97 lower(varargin{i}))
98
99 end
100end
101
102if isempty(dist_mat)
103 if strcmpi(alpha, 'nt')
104 dist_mat = nuc44;
105 toint = @nt2int;
106 else
107 dist_mat = blosum50;
108 toint = @aa2int;
109 end
110end
111
112dist_mat(end+1,:) = gap_penalty;
113dist_mat(:,end+1) = gap_penalty;
114dist_mat(end,end) = 0;
115
116switch upper(TYPE)
117 case 'MIXED'
118 is_stoch = true;
119 is_mix = true;
120 case 'STOCASTIC'
121 is_stoch = true;
122 is_mix = false;
123 case 'DETERMINISTIC'
124 is_stoch = false;
125 is_mix = false;
126end
127
128
129OUT_ALIGN = IN_ALIGN;
130cur_score = CalculateScore(OUT_ALIGN, dist_mat, toint);
131
132for i = 1:NUM_REPS
133
134 %determine whether to switch based on the "mix" factor
135 if is_mix
136 is_stoch = ~is_stoch;
137 end
138
139
140 rm_rows = GetRows(OUT_ALIGN, is_stoch, RM_FRAC, NUM_TRIES);
141 counter = 1;
142 while counter < NUM_TRIES
143 new_mat = SplitAndReAlign(OUT_ALIGN, rm_rows(:,counter), alpha);
144
145 new_score = CalculateScore(new_mat, dist_mat, toint);
146 if new_score > cur_score
147 if disp_flag
148 disp_cell = {'On Iter: ', num2str(i), ' Improved by: ', ...
149 num2str(abs(new_score-cur_score)), ' and by ', ...
150 num2str(abs(size(new_mat,2)-size(OUT_ALIGN,2))), ...
151 ' columns', ' Using stoch:', num2str(is_stoch)};
152 display([disp_cell{:}])
153 end
154 cur_score = new_score;
155 OUT_ALIGN = new_mat;
156 break
157 end
158 counter = counter + 1;
159 end
160end
161
162function rows_mat = GetRows(MAT, STOCH, RM_FRAC, NUM_TRIES)
163% GetRows
164% A helper function which determines the rows to split the alignment
165% with.
166%
167% MAT A char-array of the multiple alignment.
168% STOCH A boolean indicating whether to use a stochastic approach.
169% RM_FRAC The fraction of rows to remove.
170% NUM_TRIES The number of tries to return.
171%
172% rows_mat A matrix indicating which rows to remove.
173%
174
175if STOCH
176 rows_mat = rand(size(MAT,1),NUM_TRIES) > RM_FRAC;
177 if any(all(rows_mat,1)) || any(all(~rows_mat,1))
178 cols = any(all(rows_mat,1)) || any(all(~rows_mat,1));
179 rows_mat(1,cols) = ~rows_mat(1,cols);
180 end
181
182else
183 gap_mask = MAT =='-';
184 runs = FindRuns(gap_mask);
185 score = mode(runs.*(runs~=0))-min(runs);
186 [~, order] = sort(score, 'descend');
187 rm_rows = order(1:NUM_TRIES);
188 [~, norder] = sort(runs(:,rm_rows),'descend');
189 rows_mat = false(size(MAT,1), NUM_TRIES);
190 num_remove = ceil(RM_FRAC*size(MAT,1));
191 for i = 1:NUM_TRIES
192 rows_mat(norder(1:num_remove,i),i) = true;
193 end
194end
195
196function NEW_MAT = SplitAndReAlign(MAT, ROWS, alpha)
197% SplitAndReAlign
198% This will split the alignment matix based on the provided rows and
199% then create two profiles and re-align them.
200%
201% MAT A char-array of the alignment.
202% ROWS A boolean-array describing the switch between rows
203% alpha Either 'NT' or 'AA' to indicate which sequence profile to
204% create.
205%
206% NEW_MAT The ReAlinged alignment
207%
208
209mat1 = MAT(ROWS, :);
210mat2 = MAT(~ROWS, :);
211
212mat1(:,all(mat1 == '-',1)) = [];
213mat2(:,all(mat2 == '-',1)) = [];
214
215pmat1 = seqprofile(mat1, 'alphabet', alpha);
216pmat2 = seqprofile(mat2, 'alphabet', alpha);
217try
218 [~, ind1, ind2] = profalign(pmat1, pmat2);
219catch
220 ind1 = 1:size(pmat1,2);
221 ind2 = 1:size(pmat2,2);
222end
223
224NEW_MAT = char(0);
225NEW_MAT(ROWS, ind1) = mat1;
226NEW_MAT(~ROWS, ind2) = mat2;
227NEW_MAT(~isletter(NEW_MAT)) = '-';
228NEW_MAT(all(NEW_MAT == '-',1)) = [];
229
230
231
232
233function s = CalculateScore(MAT, DIST, FUN)
234% CalculateScore
235% A function which calculates the score of the alignment. This score
236% is based on the agreement between the consensus and each individual
237% alignment.
238%
239% MAT A char-array of a multiple alignment.
240% DIST A distance matrix which indicates the penalty for
241% mismatches.
242% FUN A function which converts letter to numbers such at nt2int
243% or aa2int.
244%
245% s The score for this alingment.
246%
247%
248
249cons = seqconsensus(MAT);
250num_cons = FUN(cons);
251num_align = FUN(MAT);
252ncons = repmat(num_cons(:), [size(num_align,1), 1]);
253nalign = num_align(:);
254lookupinds = sub2ind(size(DIST), ncons, nalign);
255s = sum(DIST(lookupinds));
256
257function reverse_looking = FindRuns(input)
258% FindRuns
259% Finds consecutive runs of 1's along the rows of a boolean array.
260%
261% INPUT = [1 1 1 0 0 0 1 0 1 0 1 1;
262% 0 1 1 1 0 1 1 0 0 1 1 1];
263% RL = [1 2 3 0 0 0 1 0 1 0 1 2;
264% [0 3 2 1 0 1 2 0 0 1 2 3];
265%
266
267[m,n] = size(input);
268reverse_looking = [zeros(1,m);input.'];
269reverse_looking = reverse_looking(:);
270p = find(~reverse_looking);
271reverse_looking(p) = [0;1-diff(p)];
272reverse_looking = reshape(cumsum(reverse_looking),[],m).';
273reverse_looking(:,1) = [];
274
275
276

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.