summaryrefslogtreecommitdiffstats
Unidiff
-rw-r--r--test/entropy/Align2Ref.m24
-rw-r--r--test/entropy/CalculateEntropy.m15
-rw-r--r--test/entropy/CalculateProteinEntropy.m25
-rw-r--r--test/entropy/FastNWalign2.c94
-rw-r--r--test/entropy/GenomeAlignments.m31
-rw-r--r--test/entropy/GenomePairwiseDist.m98
-rw-r--r--test/entropy/RefineAlignments.m276
-rw-r--r--test/entropy/don_anal.m40
-rw-r--r--test/entropy/nwalign_mod.m637
9 files changed, 1240 insertions, 0 deletions
diff --git a/test/entropy/nwalign_mod.m b/test/entropy/nwalign_mod.m
new file mode 100644
index 0000000..76aade6
--- a/dev/null
+++ b/test/entropy/nwalign_mod.m
@@ -0,0 +1,637 @@
1function [score, alignment, startat, matrices] = nwalign(seq1,seq2,varargin)
2%NWALIGN performs Needleman-Wunsch global alignment of two sequences.
3%
4% NWALIGN(SEQ1, SEQ2) returns the score (in bits) for the optimal
5% alignment. Note: The scale factor used to calculate the score is
6% provided by the scoring matrix info (see below). If this is not
7% defined, then NWALIGN returns the raw score.
8%
9% [SCORE, ALIGNMENT] = NWALIGN(SEQ1, SEQ2) returns a string showing an
10% optimal global alignment of amino acid (or nucleotide) sequences SEQ1
11% and SEQ2.
12%
13% [SCORE, ALIGNMENT, STARTAT] = NWALIGN(SEQ1, SEQ2) returns a 2x1 vector
14% with the starting point indices indicating the starting point of the
15% alignment in the two sequences. Note: this output is for consistency
16% with SWALIGN and will always be [1;1] because this is a global
17% alignment.
18%
19% NWALIGN(..., 'ALPHABET', A) specifies whether the sequences are
20% amino acids ('AA') or nucleotides ('NT'). The default is AA.
21%
22% NWALIGN(..., 'SCORINGMATRIX', matrix) defines the scoring matrix to be
23% used for the alignment. The default is BLOSUM50 for AA or NUC44 for NT.
24%
25% NWALIGN(..., 'SCALE' ,scale) indicates the scale factor of the scoring
26% matrix to return the score using arbitrary units. If the scoring matrix
27% Info also provides a scale factor, then both are used.
28%
29% NWALIGN(..., 'GAPOPEN', penalty) defines the penalty for opening a gap
30% in the alignment. The default gap open penalty is 8.
31%
32% NWALIGN(..., 'EXTENDGAP', penalty) defines the penalty for extending a
33% gap in the alignment. If EXTENDGAP is not specified, then extensions to
34% gaps are scored with the same value as GAPOPEN.
35%
36% NWALIGN(..., 'SHOWSCORE', true) displays the scoring space and the
37% winning path.
38%
39%
40% Examples:
41%
42% % Return the score in bits and the global alignment using the
43% % default scoring matrix (BLOSUM50).
44% [score, align] = nwalign('VSPAGMASGYD', 'IPGKASYD')
45%
46% % Use user-specified scoring matrix and "gap open" penalty.
47% [score, align] = nwalign('IGRHRYHIGG', 'SRYIGRG',...
48% 'scoringmatrix', @pam250, 'gapopen',5)
49%
50% % Return the score in nat units (nats).
51% [score, align] = nwalign('HEAGAWGHEE', 'PAWHEAE', 'scale', log(2))
52%
53% % Display the scoring space and the winning path.
54% nwalign('VSPAGMASGYD', 'IPGKASYD', 'showscore', true)
55%
56% See also BLOSUM, MULTIALIGN, NT2AA, PAM, PROFALIGN, SEQDOTPLOT,
57% SHOWALIGNMENT, SWALIGN.
58
59% References:
60% R. Durbin, S. Eddy, A. Krogh, and G. Mitchison. Biological Sequence
61% Analysis. Cambridge UP, 1998.
62% Needleman, S. B., Wunsch, C. D., J. Mol. Biol. (1970) 48:443-453
63
64% Copyright 2002-2006 The MathWorks, Inc.
65% $Revision: 1.22.6.10 $ $Date: 2006/05/17 20:48:28 $
66
67gapopen = -8;
68gapextend = -8;
69setGapExtend = false;
70showscore=false;
71isAminoAcid = true;
72scale=1;
73
74% If the input is a structure then extract the Sequence data.
75if isstruct(seq1)
76 try
77 seq1 = seqfromstruct(seq1);
78 catch
79 rethrow(lasterror);
80 end
81end
82if isstruct(seq2)
83 try
84 seq2 = seqfromstruct(seq2);
85 catch
86 rethrow(lasterror);
87 end
88end
89if nargin > 2
90 if rem(nargin,2) == 1
91 error('Bioinfo:IncorrectNumberOfArguments',...
92 'Incorrect number of arguments to %s.',mfilename);
93 end
94 okargs = {'scoringmatrix','gapopen','extendgap','alphabet','scale','showscore'};
95 for j=1:2:nargin-2
96 pname = varargin{j};
97 pval = varargin{j+1};
98 k = strmatch(lower(pname), okargs); %#ok
99 if isempty(k)
100 error('Bioinfo:UnknownParameterName',...
101 'Unknown parameter name: %s.',pname);
102 elseif length(k)>1
103 error('Bioinfo:AmbiguousParameterName',...
104 'Ambiguous parameter name: %s.',pname);
105 else
106 switch(k)
107 case 1 % scoring matrix
108 if isnumeric(pval)
109 ScoringMatrix = pval;
110 else
111 if ischar(pval)
112 pval = lower(pval);
113 end
114 try
115 [ScoringMatrix,ScoringMatrixInfo] = feval(pval);
116 catch
117 error('Bioinfo:InvalidScoringMatrix','Invalid scoring matrix.');
118 end
119 end
120 case 2 %gap open penalty
121 gapopen = -pval;
122 case 3 %gap extend penalty
123 gapextend = -pval;
124 setGapExtend = true;
125 case 4 %if sequence is nucleotide
126 if strcmpi(pval,'nt')
127 isAminoAcid = false;
128 end
129 case 5 % scale
130 scale=pval;
131 case 6 % showscore
132 showscore = pval == true;
133 end
134 end
135 end
136end
137
138% setting the default scoring matrix
139if ~exist('ScoringMatrix','var')
140 if isAminoAcid
141 [ScoringMatrix,ScoringMatrixInfo] = blosum50;
142 else
143 [ScoringMatrix,ScoringMatrixInfo] = nuc44;
144 end
145end
146
147
148% getting the scale from ScoringMatrixInfo, if it exists
149if exist('ScoringMatrixInfo','var') && isfield(ScoringMatrixInfo,'Scale')
150 scale=scale*ScoringMatrixInfo.Scale;
151end
152
153% handle properly "?" characters typically found in pdb files
154if isAminoAcid
155 if ischar(seq1)
156 seq1 = strrep(seq1,'?','X');
157 else
158 seq1(seq1 == 26) = 23;
159 end
160 if ischar(seq2)
161 seq2 = strrep(seq2,'?','X');
162 else
163 seq2(seq2 == 26) = 23;
164 end
165end
166
167% check input sequences
168% if isAminoAcid && ~(isaa(seq1) && isaa(seq2))
169% error('Bioinfo:InvalidAminoAcidSequences',...
170% 'Both sequences must be amino acids, use ALPHABET = ''NT'' for aligning nucleotides.');
171% elseif ~isAminoAcid && ~(isnt(seq1) && isnt(seq2))
172% error('Bioinfo:InvalidNucleotideSequences',...
173% 'When ALPHABET = ''NT'', both sequences must be nucleotides.');
174% end
175
176% use numerical arrays for easy indexing
177if ischar(seq1)
178 seq1=upper(seq1); %the output alignment will be all uppercase
179 if isAminoAcid
180 intseq1 = aa2int(seq1);
181 else
182 intseq1 = nt2int(seq1);
183 end
184else
185 intseq1=seq1;
186 if isAminoAcid
187 seq1 = int2aa(intseq1);
188 else
189 seq1 = int2nt(intseq1);
190 end
191end
192if ischar(seq2)
193 seq2=upper(seq2); %the output alignment will be all uppercase
194 if isAminoAcid
195 intseq2 = aa2int(seq2);
196 else
197 intseq2 = nt2int(seq2);
198 end
199else
200 intseq2=seq2;
201 if isAminoAcid
202 seq2 = int2aa(intseq2);
203 else
204 seq2 = int2nt(intseq2);
205 end
206end
207
208
209m = length(seq1);
210n = length(seq2);
211if ~n||~m
212 error('Bioinfo:InvalidLengthSequences','Length of input sequences must be greater than 0');
213end
214
215% If unknown, ambiguous or gaps appear in the sequence, we need to make
216% sure that ScoringMatrix can handle them.
217
218% possible values are
219% B Z X * - ?
220% 21 22 23 24 25 26
221
222scoringMatrixSize = size(ScoringMatrix,1);
223
224highestVal = max([intseq1, intseq2]);
225if highestVal > scoringMatrixSize
226 % if the matrix contains the 'Any' we map to that
227 if isAminoAcid
228 anyVal = aa2int('X');
229 else
230 anyVal = nt2int('N');
231 end
232 if scoringMatrixSize >= anyVal
233 intseq1(intseq1>scoringMatrixSize) = anyVal;
234 intseq2(intseq2>scoringMatrixSize) = anyVal;
235 else
236 error('Bioinfo:InvalidSymbolsInInputSequeces',...
237 'Sequences contain symbols that cannot be handled by the given scoring matrix.');
238 end
239end
240
241if setGapExtend % call more complicated algorithm if we have
242 [F, pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend);
243else
244% [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen);
245 scoredMatchMat=[ScoringMatrix(intseq2,intseq1(:)) zeros(n,1); zeros(1,m+1)];
246 [F pointer]=FastNWalign2(scoredMatchMat,gapopen);
247end
248
249% trace back through the pointer matrix
250halfn=ceil((n+1)/2);
251halfm=ceil((m+1)/2);
252
253i = n+1; j = m+1;
254path = zeros(n+m,2);
255step = 1;
256
257score = max(F(n+1,m+1,:));
258
259if setGapExtend
260
261 if F(n+1,m+1,3)==score % favor with left-gap
262 laststate=3;
263 elseif F(n+1,m+1,2)==score % then with up-gap
264 laststate=2;
265 else % at last with match
266 laststate=1;
267 end
268
269 while (i>halfn && j>halfm) % in the rigth half favor gaps when several
270 % paths lead to the highest score
271 state=laststate;
272 if bitget(pointer(i,j,state),3)
273 laststate=3;
274 elseif bitget(pointer(i,j,state),2)
275 laststate=2;
276 else
277 laststate=1;
278 end
279
280 switch state
281 case 1 % is diagonal
282 j = j - 1;
283 i = i - 1;
284 path(step,:) = [j,i];
285 case 2 % is up
286 i = i - 1;
287 path(step,2) = i;
288 case 3 % is left
289 j = j - 1;
290 path(step,1) = j;
291 end
292 step = step +1;
293 end
294
295 while (i>1 || j>1) % in the rigth half favor matchs when several
296 % paths lead to the highest score
297 state=laststate;
298 if bitget(pointer(i,j,state),1)
299 laststate=1;
300 elseif bitget(pointer(i,j,state),2)
301 laststate=2;
302 else
303 laststate=3;
304 end
305
306 switch state
307 case 1 % is diagonal
308 j = j - 1;
309 i = i - 1;
310 path(step,:) = [j,i];
311 case 2 % is up
312 i = i - 1;
313 path(step,2) = i;
314 case 3 % is left
315 j = j - 1;
316 path(step,1) = j;
317 end
318 step = step +1;
319 end
320
321else % ~setGapExtend
322
323 while (i > 1 || j > 1)
324
325 switch pointer(i,j)
326 case 1 % diagonal only
327 j = j - 1;
328 i = i - 1;
329 path(step,:) = [j,i];
330 case 2 % up only
331 i = i - 1;
332 path(step,2) = i;
333 case 4 % left only
334 j = j - 1;
335 path(step,1) = j;
336 case 6 % up or left --> up (favors gaps in seq2)
337 j = j - 1;
338 path(step,1) = j;
339 otherwise %3 diagonal or up --> diagonal (favors no gaps)
340 %4 diagonal or left --> diagonal (favors no gaps)
341 %7 diagonal or left or up --> diagonal (favors no gaps)
342 j = j - 1;
343 i = i - 1;
344 path(step,:) = [j,i];
345 end
346 step = step +1;
347 end
348
349end % if setGapExtend
350
351% re-scaling the output score
352score = scale * score;
353
354if nargout<=1 && ~showscore
355 return
356end
357
358path(step:end,:) = []; % step-1 is the length of the alignment
359path = flipud(path);
360
361% setting the size of the alignment
362alignment = repmat(('- -')',1,step-1);
363
364% adding sequence to alignment
365alignment(1,path(:,1)>0) = seq1;
366alignment(3,path(:,2)>0) = seq2;
367
368% find locations where there are no gaps
369h=find(all(path>0,2));
370if isAminoAcid
371 noGaps1=aa2int(alignment(1,h));
372 noGaps2=aa2int(alignment(3,h));
373else
374 noGaps1=nt2int(alignment(1,h));
375 noGaps2=nt2int(alignment(3,h));
376end
377
378% erasing symbols that can not be scored
379htodel=max([noGaps1;noGaps2])>scoringMatrixSize;
380h(htodel)=[];
381noGaps1(htodel)=[];
382noGaps2(htodel)=[];
383% score pairs with no gap
384value = ScoringMatrix(sub2ind(size(ScoringMatrix),double(noGaps1),double(noGaps2)));
385
386% insert symbols of the match string into the alignment
387alignment(2,h(value>=0)) = ':';
388alignment(2,h(noGaps1==noGaps2)) = '|';
389
390startat = [1;1];
391
392% undocumented fourth output -- score and pointer matrices
393if nargout > 3
394 matrices.Scores = F;
395 matrices.Pointers = pointer;
396end
397
398if showscore
399 figure
400 F=scale.*max(F(2:end,2:end,:),[],3);
401 clim=max(max(max(abs(F(~isinf(F))))),eps);
402 imagesc(F,[-clim clim]);
403 colormap(privateColorMap(1));
404 set(colorbar,'YLim',[min([F(:);-eps]) max([F(:);eps])])
405 title('Score for best path')
406 xlabel('Sequence 1')
407 ylabel('Sequence 2')
408 hold on
409 plot(path(all(path>0,2),1),path(all(path>0,2),2),'k.')
410end
411%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
412% function [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gap)
413% % Standard Needleman-Wunsch algorithm
414%
415% %LOOP IMPLEMENTED IN FastNWalign2.c
416%
417% % set up storage for dynamic programming matrix
418% F = zeros(n+1,m+1);
419% F(2:end,1) = gap * (1:n)';
420% F(1,2:end) = gap * (1:m);
421%
422%
423% %and for the back tracing matrix
424% pointer= repmat(uint8(4),n+1,m+1);
425% pointer(:,1) = 2; % up
426% pointer(1,1) = 1;
427%
428%
429% %initialize buffers to the first column
430% ptr = pointer(:,2); % ptr(1) is always 4
431% currentFColumn = F(:,1);
432%
433%
434% %main loop runs through the matrix looking for maximal scores
435%
436% for outer = 2:m+1
437%
438% %score current column
439% scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1));
440% %grab the data from the matrices and initialize some values
441% lastFColumn = F(:,outer-1);
442% currentFColumn = F(:,outer);
443% best = currentFColumn(1);
444%
445% %[F(:,outer) pointer(:,outer)]=FastNWalign(lastFColumn,currentFColumn,scoredMatchColumn,gap);
446%
447% for inner = 2:n+1
448% % score the three options
449% up = best + gap;
450% left = lastFColumn(inner) + gap;
451% diagonal = lastFColumn(inner-1) + scoredMatchColumn(inner-1);
452%
453% % max could be used here but it is quicker to use if statements
454% if up > left
455% best = up;
456% pos = 2;
457% else
458% best = left;
459% pos = 4;
460% end
461%
462% if diagonal >= best
463% best = diagonal;
464% ptr(inner) = 1;
465% else
466% ptr(inner) = pos;
467% end
468% currentFColumn(inner) = best;
469%
470% end % inner
471% %put back updated columns
472%
473% % save columns of pointers
474% F(:,outer) = currentFColumn;
475% % save columns of pointers
476% pointer(:,outer) = ptr;
477%
478% end % outer
479%
480% end
481%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
482 function [F,pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend)
483 % Needleman-Wunsch algorithm modified to handle affine gaps
484
485 % Set states
486 inAlign = 1;
487 inGapUp = 2;
488 inGapLeft = 3;
489 numStates = 3;
490
491 % Set up storage for dynamic programming matrix:
492 % for keeping the maximum scores for every state
493
494 F = zeros(n+1,m+1,numStates);
495 F(:,1,:) = -inf;
496 F(1,:,:) = -inf;
497 F(1,1,inAlign) = 0;
498
499 F(2:end,1,inGapUp) = gapopen + gapextend * (0:n-1)';
500 F(1,2:end,inGapLeft) = gapopen + gapextend * (0:m-1);
501
502 % and for the back tracing pointers
503 pointer(n+1,m+1,numStates) = uint8(0);
504 pointer(2:end,1,inGapUp) = 2; % up
505 pointer(1,2:end,inGapLeft) = 4; % left
506
507 % initialize buffers to the first column
508 ptrA = pointer(:,1,inAlign);
509 ptrU = pointer(:,1,inGapLeft);
510 ptrL = pointer(:,1,inGapUp);
511
512 currentFColumnA = F(:,1,inAlign);
513 currentFColumnU = F(:,1,inGapUp);
514 currentFColumnL = F(:,1,inGapLeft);
515
516 % main loop runs through the matrix looking for maximal scores
517 for outer = 2:m+1
518 % score current column
519 scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1));
520 % grab the data from the matrices and initialize some values for the
521 % first row the most orderly possible
522 lastFColumnA = currentFColumnA;
523 currentFColumnA = F(:,outer,inAlign);
524 bestA = currentFColumnA(1);
525 currentinA = lastFColumnA(1);
526
527 lastFColumnU = currentFColumnU;
528 currentFColumnU = F(:,outer,inGapUp);
529 bestU = currentFColumnU(1);
530
531 lastFColumnL = currentFColumnL;
532 currentFColumnL = F(:,outer,inGapLeft);
533 currentinGL = lastFColumnL(1);
534
535 for inner = 2:n+1
536
537 % grab the data from the columns the most orderly possible
538 upOpen = bestA + gapopen;
539 inA = currentinA;
540 currentinA = lastFColumnA(inner);
541 leftOpen = currentinA + gapopen;
542
543 inGL = currentinGL;
544 currentinGL = lastFColumnL(inner);
545 leftExtend = currentinGL + gapextend;
546
547 upExtend = bestU + gapextend;
548 inGU = lastFColumnU(inner-1);
549
550 % operate state 'inGapUp'
551
552 if upOpen > upExtend
553 bestU = upOpen; ptr = 1; % diagonal
554 elseif upOpen < upExtend
555 bestU = upExtend; ptr = 2; % up
556 else % upOpen == upExtend
557 bestU = upOpen; ptr = 3; % diagonal and up
558 end
559 currentFColumnU(inner)=bestU;
560 ptrU(inner)=ptr;
561
562 % operate state 'inGapLeft'
563
564 if leftOpen > leftExtend
565 bestL = leftOpen; ptr = 1; % diagonal
566 elseif leftOpen < leftExtend
567 bestL = leftExtend; ptr = 4; % left
568 else % leftOpen == leftExtend
569 bestL = leftOpen; ptr = 5; % diagonal and left
570 end
571 currentFColumnL(inner) = bestL;
572 ptrL(inner) = ptr;
573
574 % operate state 'inAlign'
575
576 if inA > inGU
577 if inA > inGL
578 bestA = inA; ptr = 1; % diagonal
579 elseif inGL > inA
580 bestA = inGL; ptr = 4; % left
581 else
582 bestA = inA; ptr = 5; % diagonal and left
583 end
584 elseif inGU > inA
585 if inGU > inGL
586 bestA = inGU; ptr = 2; % up
587 elseif inGL > inGU
588 bestA = inGL; ptr = 4; % left
589 else
590 bestA = inGU; ptr = 6; % up & left
591 end
592 else
593 if inA > inGL
594 bestA = inA; ptr = 3; % diagonal & up
595 elseif inGL > inA
596 bestA = inGL; ptr = 4; % left
597 else
598 bestA = inA; ptr = 7; % all
599 end
600 end
601
602 bestA = bestA + scoredMatchColumn(inner-1);
603 currentFColumnA(inner) = bestA;
604 ptrA(inner) = ptr;
605
606 end %inner
607
608 % put back updated columns
609 F(:,outer,inGapLeft) = currentFColumnL;
610 F(:,outer,inGapUp) = currentFColumnU;
611 F(:,outer,inAlign) = currentFColumnA;
612 % save columns of pointers
613 pointer(:,outer,inAlign) = ptrA;
614 pointer(:,outer,inGapUp) = ptrU;
615 pointer(:,outer,inGapLeft)= ptrL;
616 end %outer
617 end
618 function pcmap = privateColorMap(selection)
619 %PRIVATECOLORMAP returns a custom color map
620 switch selection
621 case 1, pts = [0 0 .3 20;
622 0 .1 .8 25;
623 0 .9 .5 15;
624 .9 1 .9 8;
625 1 1 0 26;
626 1 0 0 26;
627 .4 0 0 0];
628 otherwise, pts = [0 0 0 128; 1 1 1 0];
629 end
630 xcl=1;
631 for i=1:size(pts,1)-1
632 xcl=[xcl,i+1/pts(i,4):1/pts(i,4):i+1];
633 end
634 pcmap = interp1(pts(:,1:3),xcl);
635 end
636end
637

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.