9 files changed, 1240 insertions, 0 deletions
diff --git a/test/entropy/nwalign_mod.m b/test/entropy/nwalign_mod.m
new file mode 100644
index 0000000..76aade6
--- a/dev/null
+++ b/test/entropy/nwalign_mod.m
@@ -0,0 +1,637 @@
+function [score, alignment, startat, matrices] = nwalign(seq1,seq2,varargin)
+%NWALIGN performs Needleman-Wunsch global alignment of two sequences.
+%
+%   NWALIGN(SEQ1, SEQ2) returns the score (in bits) for the optimal
+%   alignment. Note: The scale factor used to calculate the score is
+%   provided by the scoring matrix info (see below). If this is not
+%   defined, then NWALIGN returns the raw score.
+%
+%   [SCORE, ALIGNMENT] = NWALIGN(SEQ1, SEQ2) returns a string showing an
+%   optimal global alignment of amino acid (or nucleotide) sequences SEQ1
+%   and SEQ2.
+%
+%   [SCORE, ALIGNMENT, STARTAT] = NWALIGN(SEQ1, SEQ2)  returns a 2x1 vector
+%   with the starting point indices indicating the starting point of the
+%   alignment in the two sequences. Note: this output is for consistency
+%   with SWALIGN and will always be [1;1] because this is a global
+%   alignment.
+%
+%   NWALIGN(..., 'ALPHABET', A) specifies whether the sequences are
+%   amino acids ('AA') or nucleotides ('NT'). The default is AA.
+%
+%   NWALIGN(..., 'SCORINGMATRIX', matrix) defines the scoring matrix to be
+%   used for the alignment. The default is BLOSUM50 for AA or NUC44 for NT.
+%
+%   NWALIGN(..., 'SCALE' ,scale) indicates the scale factor of the scoring
+%   matrix to return the score using arbitrary units. If the scoring matrix
+%   Info also provides a scale factor, then both are used.
+%
+%   NWALIGN(..., 'GAPOPEN', penalty) defines the penalty for opening a gap
+%   in the alignment. The default gap open penalty is 8.
+%
+%   NWALIGN(..., 'EXTENDGAP', penalty) defines the penalty for extending a
+%   gap in the alignment. If EXTENDGAP is not specified, then extensions to
+%   gaps are scored with the same value as GAPOPEN.
+%
+%   NWALIGN(..., 'SHOWSCORE', true) displays the scoring space and the
+%   winning path.
+%
+%
+%   Examples:
+%
+%       % Return the score in bits and the global alignment using the
+%       % default scoring matrix (BLOSUM50).
+%       [score, align] = nwalign('VSPAGMASGYD', 'IPGKASYD')
+%
+%       % Use user-specified scoring matrix and "gap open" penalty.
+%       [score, align] = nwalign('IGRHRYHIGG', 'SRYIGRG',...
+%                               'scoringmatrix', @pam250, 'gapopen',5)
+%
+%       % Return the score in nat units (nats).
+%       [score, align] = nwalign('HEAGAWGHEE', 'PAWHEAE', 'scale', log(2))
+%
+%       % Display the scoring space and the winning path.
+%       nwalign('VSPAGMASGYD', 'IPGKASYD', 'showscore', true)
+%
+%   See also BLOSUM, MULTIALIGN, NT2AA, PAM, PROFALIGN, SEQDOTPLOT,
+%   SHOWALIGNMENT, SWALIGN.
+
+%   References:
+%   R. Durbin, S. Eddy, A. Krogh, and G. Mitchison. Biological Sequence
+%   Analysis. Cambridge UP, 1998.
+%   Needleman, S. B., Wunsch, C. D., J. Mol. Biol. (1970) 48:443-453
+
+%   Copyright 2002-2006 The MathWorks, Inc.
+%   $Revision: 1.22.6.10 $  $Date: 2006/05/17 20:48:28 $
+
+gapopen = -8;
+gapextend = -8;
+setGapExtend = false;
+showscore=false;
+isAminoAcid = true;
+scale=1;
+
+% If the input is a structure then extract the Sequence data.
+if isstruct(seq1)
+    try
+        seq1 = seqfromstruct(seq1);
+    catch
+        rethrow(lasterror);
+    end
+end
+if isstruct(seq2)
+    try
+        seq2 = seqfromstruct(seq2);
+    catch
+        rethrow(lasterror);
+    end
+end
+if nargin > 2
+    if rem(nargin,2) == 1
+        error('Bioinfo:IncorrectNumberOfArguments',...
+            'Incorrect number of arguments to %s.',mfilename);
+    end
+    okargs = {'scoringmatrix','gapopen','extendgap','alphabet','scale','showscore'};
+    for j=1:2:nargin-2
+        pname = varargin{j};
+        pval = varargin{j+1};
+        k = strmatch(lower(pname), okargs); %#ok
+        if isempty(k)
+            error('Bioinfo:UnknownParameterName',...
+                'Unknown parameter name: %s.',pname);
+        elseif length(k)>1
+            error('Bioinfo:AmbiguousParameterName',...
+                'Ambiguous parameter name: %s.',pname);
+        else
+            switch(k)
+                case 1  % scoring matrix
+                    if isnumeric(pval)
+                        ScoringMatrix = pval;
+                    else
+                        if ischar(pval)
+                            pval = lower(pval);
+                        end
+                        try
+                            [ScoringMatrix,ScoringMatrixInfo] = feval(pval);
+                        catch
+                            error('Bioinfo:InvalidScoringMatrix','Invalid scoring matrix.');
+                        end
+                    end
+                case 2 %gap open penalty
+                    gapopen = -pval;
+                case 3 %gap extend penalty
+                    gapextend = -pval;
+                    setGapExtend = true;
+                case 4 %if sequence is nucleotide
+                    if strcmpi(pval,'nt')
+                        isAminoAcid = false;
+                    end
+                case 5 % scale
+                    scale=pval;
+                case 6 % showscore
+                    showscore = pval == true;
+            end
+        end
+    end
+end
+
+% setting the default scoring matrix
+if ~exist('ScoringMatrix','var')
+    if isAminoAcid
+        [ScoringMatrix,ScoringMatrixInfo] = blosum50;
+    else
+        [ScoringMatrix,ScoringMatrixInfo] = nuc44;
+    end
+end
+
+
+% getting the scale from ScoringMatrixInfo, if it exists
+if exist('ScoringMatrixInfo','var') && isfield(ScoringMatrixInfo,'Scale')
+    scale=scale*ScoringMatrixInfo.Scale;
+end
+
+% handle properly "?" characters typically found in pdb files
+if isAminoAcid
+    if ischar(seq1)
+        seq1 = strrep(seq1,'?','X');
+    else
+        seq1(seq1 == 26) = 23;
+    end
+    if ischar(seq2)
+        seq2 = strrep(seq2,'?','X');
+    else
+        seq2(seq2 == 26) = 23;
+    end
+end
+
+% check input sequences
+% if isAminoAcid && ~(isaa(seq1) && isaa(seq2))
+%     error('Bioinfo:InvalidAminoAcidSequences',...
+%         'Both sequences must be amino acids, use ALPHABET = ''NT'' for aligning nucleotides.');
+% elseif ~isAminoAcid && ~(isnt(seq1) && isnt(seq2))
+%     error('Bioinfo:InvalidNucleotideSequences',...
+%         'When ALPHABET = ''NT'', both sequences must be nucleotides.');
+% end
+
+% use numerical arrays for easy indexing
+if ischar(seq1)
+    seq1=upper(seq1); %the output alignment will be all uppercase
+    if isAminoAcid
+        intseq1 = aa2int(seq1);
+    else
+        intseq1 = nt2int(seq1);
+    end
+else
+    intseq1=seq1;
+    if isAminoAcid
+        seq1 = int2aa(intseq1);
+    else
+        seq1 = int2nt(intseq1);
+    end
+end
+if ischar(seq2)
+    seq2=upper(seq2); %the output alignment will be all uppercase
+    if isAminoAcid
+        intseq2 = aa2int(seq2);
+    else
+        intseq2 = nt2int(seq2);
+    end
+else
+    intseq2=seq2;
+    if isAminoAcid
+        seq2 = int2aa(intseq2);
+    else
+        seq2 = int2nt(intseq2);
+    end
+end
+
+
+m = length(seq1);
+n = length(seq2);
+if ~n||~m
+    error('Bioinfo:InvalidLengthSequences','Length of input sequences must be greater than 0');
+end
+
+% If unknown, ambiguous or gaps appear in the sequence, we need to make
+% sure that ScoringMatrix can handle them.
+
+% possible values are
+% B  Z  X  *  -  ?
+% 21 22 23 24 25 26
+
+scoringMatrixSize = size(ScoringMatrix,1);
+
+highestVal = max([intseq1, intseq2]);
+if highestVal > scoringMatrixSize
+    % if the matrix contains the 'Any' we map to that
+    if isAminoAcid
+        anyVal = aa2int('X');
+    else
+        anyVal = nt2int('N');
+    end
+    if scoringMatrixSize >= anyVal
+        intseq1(intseq1>scoringMatrixSize) = anyVal;
+        intseq2(intseq2>scoringMatrixSize) = anyVal;
+    else
+        error('Bioinfo:InvalidSymbolsInInputSequeces',...
+            'Sequences contain symbols that cannot be handled by the given scoring matrix.');
+    end
+end
+
+if setGapExtend  % call more complicated algorithm if we have
+    [F, pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend);
+else
+%   [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen);
+    scoredMatchMat=[ScoringMatrix(intseq2,intseq1(:)) zeros(n,1); zeros(1,m+1)];
+    [F pointer]=FastNWalign2(scoredMatchMat,gapopen);
+end
+
+% trace back through the pointer matrix
+halfn=ceil((n+1)/2);
+halfm=ceil((m+1)/2);
+
+i = n+1; j = m+1;
+path = zeros(n+m,2);
+step = 1;
+
+score =  max(F(n+1,m+1,:));
+
+if setGapExtend
+
+    if F(n+1,m+1,3)==score      % favor with left-gap
+        laststate=3;
+    elseif F(n+1,m+1,2)==score  % then with up-gap
+        laststate=2;
+    else                        % at last with match
+        laststate=1;
+    end
+
+    while (i>halfn && j>halfm) % in the rigth half favor gaps when several
+        % paths lead to the highest score
+        state=laststate;
+        if bitget(pointer(i,j,state),3)
+            laststate=3;
+        elseif bitget(pointer(i,j,state),2)
+            laststate=2;
+        else
+            laststate=1;
+        end
+
+        switch state
+            case 1 % is diagonal
+                j = j - 1;
+                i = i - 1;
+                path(step,:) = [j,i];
+            case 2 % is up
+                i = i - 1;
+                path(step,2) = i;
+            case 3 % is left
+                j = j - 1;
+                path(step,1) = j;
+        end
+        step = step +1;
+    end
+
+    while (i>1 || j>1)    % in the rigth half favor matchs when several
+        % paths lead to the highest score
+        state=laststate;
+        if bitget(pointer(i,j,state),1)
+            laststate=1;
+        elseif bitget(pointer(i,j,state),2)
+            laststate=2;
+        else
+            laststate=3;
+        end
+
+        switch state
+            case 1 % is diagonal
+                j = j - 1;
+                i = i - 1;
+                path(step,:) = [j,i];
+            case 2 % is up
+                i = i - 1;
+                path(step,2) = i;
+            case 3 % is left
+                j = j - 1;
+                path(step,1) = j;
+        end
+        step = step +1;
+    end
+
+else % ~setGapExtend
+
+    while (i > 1 || j > 1)
+
+        switch pointer(i,j)
+            case 1 % diagonal only
+                j = j - 1;
+                i = i - 1;
+                path(step,:) = [j,i];
+            case 2 % up only
+                i = i - 1;
+                path(step,2) = i;
+            case 4 % left only
+                j = j - 1;
+                path(step,1) = j;
+            case 6 % up or left --> up (favors gaps in seq2)
+                j = j - 1;
+                path(step,1) = j;
+            otherwise %3 diagonal or up   --> diagonal (favors no gaps)
+                %4 diagonal or left       --> diagonal (favors no gaps)
+                %7 diagonal or left or up --> diagonal (favors no gaps)
+                j = j - 1;
+                i = i - 1;
+                path(step,:) = [j,i];
+        end
+        step = step +1;
+    end
+
+end % if setGapExtend
+
+% re-scaling the output score
+score = scale * score;
+
+if nargout<=1 && ~showscore
+    return
+end
+
+path(step:end,:) = []; % step-1 is the length of the alignment
+path = flipud(path);
+
+% setting the size of the alignment
+alignment = repmat(('- -')',1,step-1);
+
+% adding sequence to alignment
+alignment(1,path(:,1)>0) = seq1;
+alignment(3,path(:,2)>0) = seq2;
+
+% find locations where there are no gaps
+h=find(all(path>0,2));
+if isAminoAcid
+    noGaps1=aa2int(alignment(1,h));
+    noGaps2=aa2int(alignment(3,h));
+else
+    noGaps1=nt2int(alignment(1,h));
+    noGaps2=nt2int(alignment(3,h));
+end
+
+% erasing symbols that can not be scored
+htodel=max([noGaps1;noGaps2])>scoringMatrixSize;
+h(htodel)=[];
+noGaps1(htodel)=[];
+noGaps2(htodel)=[];
+% score pairs with no gap
+value = ScoringMatrix(sub2ind(size(ScoringMatrix),double(noGaps1),double(noGaps2)));
+
+% insert symbols of the match string into the alignment
+alignment(2,h(value>=0)) = ':';
+alignment(2,h(noGaps1==noGaps2)) = '|';
+
+startat = [1;1];
+
+% undocumented fourth output -- score and pointer matrices
+if nargout > 3
+    matrices.Scores = F;
+    matrices.Pointers = pointer;
+end
+
+if showscore
+    figure
+    F=scale.*max(F(2:end,2:end,:),[],3);
+    clim=max(max(max(abs(F(~isinf(F))))),eps);
+    imagesc(F,[-clim clim]);
+    colormap(privateColorMap(1));
+    set(colorbar,'YLim',[min([F(:);-eps]) max([F(:);eps])])
+    title('Score for best path')
+    xlabel('Sequence 1')
+    ylabel('Sequence 2')
+    hold on
+    plot(path(all(path>0,2),1),path(all(path>0,2),2),'k.')
+end
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%     function [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gap)
+%         % Standard Needleman-Wunsch algorithm
+% 
+%         %LOOP IMPLEMENTED IN FastNWalign2.c
+%         
+%         % set up storage for dynamic programming matrix
+%         F = zeros(n+1,m+1);
+%         F(2:end,1) = gap * (1:n)';
+%         F(1,2:end) = gap * (1:m);
+% 
+% 
+%         %and for the back tracing matrix
+%         pointer= repmat(uint8(4),n+1,m+1);
+%         pointer(:,1) = 2;  % up
+%         pointer(1,1) = 1;
+% 
+% 
+%         %initialize buffers to the first column
+%         ptr = pointer(:,2); % ptr(1) is always 4
+%         currentFColumn = F(:,1);
+% 
+% 
+%         %main loop runs through the matrix looking for maximal scores
+% 
+%         for outer = 2:m+1
+% 
+%             %score current column
+%             scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1));
+%             %grab the data from the matrices and initialize some values
+%             lastFColumn    = F(:,outer-1);
+%             currentFColumn = F(:,outer);
+%             best = currentFColumn(1);
+% 
+%             %[F(:,outer) pointer(:,outer)]=FastNWalign(lastFColumn,currentFColumn,scoredMatchColumn,gap);
+% 
+%             for inner = 2:n+1
+%                 % score the three options
+%                 up       = best + gap;
+%                 left     = lastFColumn(inner) + gap;
+%                 diagonal = lastFColumn(inner-1) + scoredMatchColumn(inner-1);
+% 
+%                 % max could be used here but it is quicker to use if statements
+%                 if up > left
+%                     best = up;
+%                     pos = 2;
+%                 else
+%                     best = left;
+%                     pos = 4;
+%                 end
+% 
+%                 if diagonal >= best
+%                     best = diagonal;
+%                     ptr(inner) = 1;
+%                 else
+%                     ptr(inner) = pos;
+%                 end
+%                 currentFColumn(inner) = best;
+% 
+%             end % inner
+%             %put back updated columns
+% 
+%             % save columns of pointers
+%             F(:,outer)   = currentFColumn;
+%             % save columns of pointers
+%             pointer(:,outer)  = ptr;
+% 
+%         end % outer
+% 
+%     end
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+    function [F,pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend)
+        % Needleman-Wunsch algorithm modified to handle affine gaps
+
+        % Set states
+        inAlign =   1;
+        inGapUp =   2;
+        inGapLeft = 3;
+        numStates = 3;
+
+        % Set up storage for dynamic programming matrix:
+        % for keeping the maximum scores for every state
+
+        F =  zeros(n+1,m+1,numStates);
+        F(:,1,:) = -inf;
+        F(1,:,:) = -inf;
+        F(1,1,inAlign) = 0;
+
+        F(2:end,1,inGapUp)   = gapopen + gapextend * (0:n-1)';
+        F(1,2:end,inGapLeft) = gapopen + gapextend * (0:m-1);
+
+        % and for the back tracing pointers
+        pointer(n+1,m+1,numStates) = uint8(0);
+        pointer(2:end,1,inGapUp)   = 2;  % up
+        pointer(1,2:end,inGapLeft) = 4;  % left
+
+        % initialize buffers to the first column
+        ptrA = pointer(:,1,inAlign);
+        ptrU = pointer(:,1,inGapLeft);
+        ptrL = pointer(:,1,inGapUp);
+
+        currentFColumnA = F(:,1,inAlign);
+        currentFColumnU = F(:,1,inGapUp);
+        currentFColumnL = F(:,1,inGapLeft);
+
+        % main loop runs through the matrix looking for maximal scores
+        for outer = 2:m+1
+            % score current column
+            scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1));
+            % grab the data from the matrices and initialize some values for the
+            % first row the most orderly possible
+            lastFColumnA    = currentFColumnA;
+            currentFColumnA = F(:,outer,inAlign);
+            bestA           = currentFColumnA(1);
+            currentinA      = lastFColumnA(1);
+
+            lastFColumnU    = currentFColumnU;
+            currentFColumnU = F(:,outer,inGapUp);
+            bestU           = currentFColumnU(1);
+
+            lastFColumnL    = currentFColumnL;
+            currentFColumnL = F(:,outer,inGapLeft);
+            currentinGL     = lastFColumnL(1);
+
+            for inner = 2:n+1
+
+                % grab the data from the columns the most orderly possible
+                upOpen      = bestA + gapopen;
+                inA         = currentinA;
+                currentinA  = lastFColumnA(inner);
+                leftOpen    = currentinA + gapopen;
+
+                inGL        = currentinGL;
+                currentinGL = lastFColumnL(inner);
+                leftExtend  = currentinGL + gapextend;
+
+                upExtend = bestU + gapextend;
+                inGU     = lastFColumnU(inner-1);
+
+                % operate state 'inGapUp'
+
+                if upOpen > upExtend
+                    bestU = upOpen; ptr = 1;   % diagonal
+                elseif upOpen < upExtend
+                    bestU = upExtend; ptr = 2; % up
+                else % upOpen == upExtend
+                    bestU = upOpen; ptr = 3;   % diagonal and up
+                end
+                currentFColumnU(inner)=bestU;
+                ptrU(inner)=ptr;
+
+                % operate state 'inGapLeft'
+
+                if leftOpen > leftExtend
+                    bestL = leftOpen; ptr = 1;   % diagonal
+                elseif leftOpen < leftExtend
+                    bestL = leftExtend; ptr = 4; % left
+                else % leftOpen == leftExtend
+                    bestL = leftOpen; ptr = 5;   % diagonal and left
+                end
+                currentFColumnL(inner) = bestL;
+                ptrL(inner) = ptr;
+
+                % operate state 'inAlign'
+
+                if  inA > inGU
+                    if inA > inGL
+                        bestA = inA; ptr = 1;  % diagonal
+                    elseif inGL > inA
+                        bestA = inGL; ptr = 4; % left
+                    else
+                        bestA = inA; ptr = 5;  % diagonal and left
+                    end
+                elseif inGU > inA
+                    if inGU > inGL
+                        bestA = inGU; ptr = 2; % up
+                    elseif inGL > inGU
+                        bestA = inGL; ptr = 4; % left
+                    else
+                        bestA = inGU; ptr = 6; % up & left
+                    end
+                else
+                    if inA > inGL
+                        bestA = inA; ptr = 3;  % diagonal & up
+                    elseif inGL > inA
+                        bestA = inGL; ptr = 4; % left
+                    else
+                        bestA = inA; ptr = 7;  % all
+                    end
+                end
+
+                bestA = bestA + scoredMatchColumn(inner-1);
+                currentFColumnA(inner) = bestA;
+                ptrA(inner) = ptr;
+
+            end %inner
+
+            % put back updated columns
+            F(:,outer,inGapLeft) = currentFColumnL;
+            F(:,outer,inGapUp)   = currentFColumnU;
+            F(:,outer,inAlign)   = currentFColumnA;
+            % save columns of pointers
+            pointer(:,outer,inAlign)  = ptrA;
+            pointer(:,outer,inGapUp)  = ptrU;
+            pointer(:,outer,inGapLeft)= ptrL;
+        end %outer
+    end
+    function pcmap = privateColorMap(selection)
+        %PRIVATECOLORMAP returns a custom color map
+        switch selection
+            case 1, pts = [0 0 .3 20;
+                    0 .1 .8 25;
+                    0 .9 .5 15;
+                    .9 1 .9 8;
+                    1 1 0 26;
+                    1 0 0 26;
+                    .4 0 0 0];
+            otherwise, pts = [0 0 0 128; 1 1 1 0];
+        end
+        xcl=1;
+        for i=1:size(pts,1)-1
+            xcl=[xcl,i+1/pts(i,4):1/pts(i,4):i+1];
+        end
+        pcmap = interp1(pts(:,1:3),xcl);
+    end
+end
+