Will Dampier's Matlab code for analyzing the entropy of the collection by sequence position and time.

author: Don Pellegrino <don@drexel.edu> 2010-01-08 17:20:16 (GMT)
committer: Don Pellegrino <don@drexel.edu> 2010-01-08 17:20:16 (GMT)
commit: 170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf (patch) (unidiff)
tree: bcfea699462d48906f11dfc5fb4d36725bc12710
parent: 15b4e1318130d4c49109746efecc1018dce54da9 (diff)
download: exp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.zip
exp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.tar.gz
exp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.tar.bz2
9 files changed, 1240 insertions, 0 deletions
diff --git a/test/entropy/Align2Ref.m b/test/entropy/Align2Ref.m
new file mode 100644
index 0000000..3021d4a
--- a/dev/null
+++ b/test/entropy/Align2Ref.m
@@ -0,0 +1,24 @@
+function NEW_INDS = Align2Ref(REF_SEQ, ALIGN, ALIGN_INDS, ALPHA)
+%   Align2Ref
+%       Uses a profile alignment to change a set of indicices in the
+%       "alignment space" to the "reference space".
+%
+%
+%   NEW_INDS = Align2Ref(REF_SEQ, ALIGN, ALIGN_INDS)
+%
+if nargin == 3
+    ALPHA = 'aa';
+end
+ref_prof = seqprofile(REF_SEQ, 'alphabet', ALPHA);
+align_prof = seqprofile(ALIGN, 'alphabet', ALPHA);
+[~, ind1, ind2] = profalign(ref_prof, align_prof);
+ninds = ind2(ALIGN_INDS);
+mask = sum(bsxfun(@le, ind1(:)', ninds(:)),2);
+mask(mask == 0) = 1;
+NEW_INDS = reshape(ind1(mask), size(ALIGN_INDS));
+\ No newline at end of file
diff --git a/test/entropy/CalculateEntropy.m b/test/entropy/CalculateEntropy.m
new file mode 100644
index 0000000..18bfda5
--- a/dev/null
+++ b/test/entropy/CalculateEntropy.m
@@ -0,0 +1,15 @@
+function OUT_VEC = CalculateEntropy(ARRAY)
+%   CalculateEntropy
+%       Calculate the Informationational Entropy of each position in a
+%       multialignment.  In this function I remove all gap characters from
+%       each column of the alignment before doing the calculation.
+%
+%
+%
+num_array = aa2int(ARRAY);
+bins = histc(num_array, 0:21, 1);
+probs=bsxfun(@rdivide, bins(2:end-1,:),sum(bins(2:end-1,:)));
+OUT_VEC = -nansum(log(probs).*double(probs~=0).*probs);
+\ No newline at end of file
diff --git a/test/entropy/CalculateProteinEntropy.m b/test/entropy/CalculateProteinEntropy.m
new file mode 100644
index 0000000..41f06f5
--- a/dev/null
+++ b/test/entropy/CalculateProteinEntropy.m
@@ -0,0 +1,25 @@
+function OUT_DATA=CalculateProteinEntropy(Ref, ALIGN_CELL)
+OUT_DATA = cell(size(ALIGN_CELL));
+for i = 1:size(ALIGN_CELL,1)
+    
+    aligned_seqs = ALIGN_CELL{i};
+    
+    ref_seq = Ref.Sequence;
+    
+    ref_prof = seqprofile(ref_seq, 'alphabet', 'aa');
+    seqs_prof = seqprofile(aligned_seqs, 'alphabet', 'aa');
+    
+    [mp, H1, H2] = profalign(ref_prof, seqs_prof);
+    
+    merged_seqs = char(zeros(size(aligned_seqs,1)+1, size(mp,2)));
+    merged_seqs(2:end, H2) = aligned_seqs;
+    merged_seqs(1, H1) = ref_seq;
+    
+    entropy_vals = CalculateEntropy(merged_seqs);
+    
+    %only pull out the values that are in the reference positions
+    ref_entropy = entropy_vals(H1);
+    
+    OUT_DATA{i} = ref_entropy;
+end
diff --git a/test/entropy/FastNWalign2.c b/test/entropy/FastNWalign2.c
new file mode 100644
index 0000000..33b286d
--- a/dev/null
+++ b/test/entropy/FastNWalign2.c
@@ -0,0 +1,94 @@
+#include "mex.h"
+#include "matrix.h"
+void simplegap(double scoredMatchMat[],
+const double gap, int n, int m, double output_F[], double output_P[])
+{
+    // Standard Needleman-Wunsch algorithm
+    
+    double up, left, diagonal, best, pos;
+    
+    
+    int i,j;
+    
+    for(i=0;i<m;i++)
+    {
+        output_F[i]=i*gap;
+        output_P[i]=2;
+    }
+    
+    
+    for(j=0; j<n; j++)  //put initial values in
+    {
+        output_F[j*m]=j*gap;
+        output_P[j*m]=4;
+    }
+    output_P[0]=1;
+    
+   
+    for(j=1;j<n;j++) //cycle through columns
+    {
+         best=output_F[(j)*m];
+         for(i=1; i<m; i++) //cycle through the rows
+         {
+             up=best+gap;
+             left=output_F[i+(j-1)*m]+gap;
+             diagonal=output_F[i-1+(j-1)*m]+scoredMatchMat[i-1+(j-1)*m];
+             
+             if (up>left)
+             {
+                 best=up;
+                 pos=2;
+             }
+             else
+            {
+                best=left;
+                pos=4;
+            }
+            
+            if (diagonal>=best)
+            {
+                best=diagonal;
+                output_P[i+j*m]=1;
+            }
+            else
+            {
+                output_P[i+j*m]=pos;
+            }
+            output_F[i+j*m]=best;
+        }
+    }
+}
+void mexFunction(int nlhs, mxArray *plhs[],
+int nrhs, const mxArray *prhs[])
+{
+    
+    double *gap, *output_F, *output_P;
+    double *scoredMatchMat;
+    int n, m;
+    mwSize i;
+    
+    //double *F_col, *ptr_col;
+    m=mxGetM(prhs[0]);
+    n=mxGetN(prhs[0]);
+    gap=mxGetPr(prhs[1]);
+    
+    plhs[0]=mxCreateDoubleMatrix(m,n,mxREAL);
+    plhs[1]=mxCreateDoubleMatrix(m,n,mxREAL);
+    
+    output_F=mxGetPr(plhs[0]);
+    output_P=mxGetPr(plhs[1]);
+    
+    
+    scoredMatchMat=mxGetPr(prhs[0]);
+    //
+    
+    
+    simplegap(scoredMatchMat,*gap,n,m,output_F,output_P);
+    
+}
diff --git a/test/entropy/GenomeAlignments.m b/test/entropy/GenomeAlignments.m
new file mode 100644
index 0000000..be56570
--- a/dev/null
+++ b/test/entropy/GenomeAlignments.m
@@ -0,0 +1,31 @@
+function [ALIGN inds] = GenomeAlignments(IN_GENOMES, DIST_MAT)
+%   GenomeAlignments
+%       This will align all of the protein segments in the genome structure
+%       and return a cell-array.  Each cell is the alignment of one
+%       protein.  If the protein is missing then the distance is NaN.
+%
+[tree inds] = MakeTree(DIST_MAT);
+ALIGN = malign(tree, inds);
+    function align=malign(tree, inds)
+        seqs = arrayfun(@(x)(x.Sequence), IN_GENOMES(inds), 'uniformoutput', false);
+        align = multialign(seqs, tree);
+    end
+    function [tree_obj inds]=MakeTree(dist_mat)
+        %   MakeTree
+        %       Takes a distance matrix and returns a tree object and the indicies
+        %       to the rows that are in the tree.
+        %
+        
+        inds = find(~all(isnan(dist_mat)|isinf(dist_mat)|dist_mat==0));
+        dvec = squareform(dist_mat(inds, inds));
+        dvec(dvec <= 0) = rand(nnz(dvec <= 0),1)*min(dvec(dvec>0));
+        tree_obj = seqlinkage(dvec);
+        
+    end
+end
+\ No newline at end of file
diff --git a/test/entropy/GenomePairwiseDist.m b/test/entropy/GenomePairwiseDist.m
new file mode 100644
index 0000000..544db0a
--- a/dev/null
+++ b/test/entropy/GenomePairwiseDist.m
@@ -0,0 +1,98 @@
+function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME)
+%   GenomePairwiseDist
+%       This will align all of the protein segments in the genome structure
+%       and return a cell-array.  Each cell is the alignment of one
+%       protein.  If the protein is missing then the distance is NaN.
+%
+if nargin < 1
+        ALIGN_NAME = 'saved_alignments';
+end
+try
+    display('Loading Previous Alignments')
+    d = load(ALIGN_NAME);
+    saved_dists = d.DIST_CELL;
+    saved_headers = d.header_names;
+catch %#ok<CTCH>
+    display('No previous alignments found')
+    saved_headers = {};
+    saved_dists = zeros(numel(IN_PROTS));
+end
+            
+NUM_LETTERS = 4000000;
+[header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header);
+display('Starting Alignments')
+dist_mat = NaN(size(IN_PROTS,1));
+[tf loc]=ismember(header_names, saved_headers);
+dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf));
+[I J] = find(triu(isnan(dist_mat)));
+inds = sub2ind(size(dist_mat), I, J);
+sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS);
+%find all of the ones that are not worth aligning
+bad_mask = sizes(I) <= 1 | sizes(J) <= 1;
+dist_mat(inds(bad_mask)) = Inf;
+I(bad_mask) = [];
+J(bad_mask) = [];
+p_sizes = sizes(I)+sizes(J);
+counter = 1;
+display('Aligning...')
+tic
+while counter < length(I)
+    cum_sizes = cumsum(p_sizes(counter+1:end));
+    top = counter+find(cum_sizes<NUM_LETTERS,1, 'last');
+    
+    I_spots = I(counter:top);
+    J_spots = J(counter:top);
+    need_align = cell(length(I_spots),2);
+    [need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence);
+    [need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence);
+    dist = zeros(size(I_spots));
+    
+        %parfor k = 1:size(need_align,1)
+    for k = 1:size(need_align,1)
+        %display([num2str(k) ' ' num2str(toc)])
+        try
+            dist(k)=nwalign_mod(need_align{k,:});
+        catch %#ok<CTCH>
+            dist(k) = Inf;
+        end
+    end
+    time = toc;
+    inds = sub2ind(size(dist_mat), I_spots,J_spots);
+    dist_mat(inds) = dist;
+    counter = top+1;
+    display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds'])
+end
+DIST_CELL = dist_mat;
+save(ALIGN_NAME, 'DIST_CELL', 'header_names')
+DIST_OUT = dfun(DIST_CELL);
+function out_d = dfun(MAT)
+score12 = triu(MAT,1);
+self_d = diag(MAT);
+d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)'));
+out_d = triu(d,1) + triu(d,1)';
diff --git a/test/entropy/RefineAlignments.m b/test/entropy/RefineAlignments.m
new file mode 100644
index 0000000..0e69275
--- a/dev/null
+++ b/test/entropy/RefineAlignments.m
@@ -0,0 +1,276 @@
+function [OUT_ALIGN cur_score] = RefineAlignments(IN_ALIGN, varargin)
+%   RefineAlignments
+%       Uses an interative algorithm to improve the multiple alignments of
+%       an input sequence.  It will iteratively remove a fraction of the
+%       sequences, compress any gaps, and then re-align.  This will be done
+%       for NUM_REPS.
+%
+%   Written by Will Dampier.  Contact at wnd22@drexel.edu
+%   Version 1.0 on 10/23/09
+%   
+%   [OUT_ALIGN SCORE] = RefineAlignments(IN_ALIGN)
+%
+%   IN_ALIGN        A char-array representing a multiple alignment to be
+%                   refined.
+%
+%   OUT_ALIGN       The refined alignment.
+%   SCORE           The new alignment score.
+%
+%
+%   Optional Arguements:
+%
+%       'Alphabet'      Either 'AA' or 'NT' to indicate whether this is a
+%                       nucleotide or amino acid alignment.
+%
+%       'GapPenalty'    The penalty for gaps.  This is set at -8 by
+%                       default.
+%
+%       'DistMat'       The distance matrix to use for the alignment 
+%                       scoring.  Default is BLOSUM50 for 'AA' and NUC44
+%                       for 'NT'
+%
+%       'NumReps'       The number of refining iterations to do.
+%
+%       'NumTry'        The number of tries to perform at each iteration.
+%
+%       'RefineType'    Which type of refinement to perform.  This can
+%                       be either 'STOCASTIC', 'DETERMINISTIC' or 'MIXED'.
+%
+%       'Display'       A boolean indicating whether to display the output
+%                       at each iteration.
+%
+%
+RM_FRAC = 0.1;
+NUM_TRIES = 10;
+NUM_REPS = 100;
+alpha = 'aa';
+dist_mat = [];
+gap_penalty = -8;
+TYPE = 'mixed';
+disp_flag = true;
+for i = 1:2:length(varargin)
+    switch lower(varargin{i})
+        
+        case 'alphabet'
+            if strcmpi('nt', varargin{i+1}) || strcmpi('aa', varargin{i+1})
+                alpha = varargin{i+1};
+            else
+                error('RefineAlignments:BADALPHA', ...
+                    'Arguement to "alphabet" must be "NT" or "AA"')
+            end
+        case 'gappenalty'
+            if isnumeric(varargin{i+1}) && varargin{i+1} < 0
+                gap_penalty = varargin{i+1};
+            else
+                error('RefineAlignments:BADGAP', ...
+                    'Arguement to "GapPenalty" must be negative numeric')
+            end
+        case 'distmat'
+            dist_mat = varargin{i+1};
+            
+        case 'numreps'
+            if isnumeric(varargin{i+1}) && varargin{i+1} > 1
+                NUM_REPS = varargin{i+1};
+            else
+                error('RefineAlignments:BADREPS', ...
+                        'Arguement to NumReps must be a positive numeric')
+            end
+            
+        case 'refinetype'
+            TYPE = varargin{i+1};
+            
+        case 'display'
+            if islogical(varargin{i+1})
+                disp_flag = varargin{i+1};
+            else
+                error('RefineAlignments:DISPLAY', ...
+                        'Arguement to Display must be a boolean.')
+            end
+       
+        otherwise
+            error('RefineAlignments:BADARG', 'Unknown arguement: %s', ...
+                    lower(varargin{i}))
+        
+    end
+end
+if isempty(dist_mat)
+    if strcmpi(alpha, 'nt')
+        dist_mat = nuc44;
+        toint = @nt2int;
+    else
+        dist_mat = blosum50;
+        toint = @aa2int;
+    end
+end
+dist_mat(end+1,:) = gap_penalty;
+dist_mat(:,end+1) = gap_penalty;
+dist_mat(end,end) = 0;
+switch upper(TYPE)
+    case 'MIXED'
+        is_stoch = true;
+        is_mix = true;
+    case 'STOCASTIC'
+        is_stoch = true;
+        is_mix = false;
+    case 'DETERMINISTIC'
+        is_stoch = false;
+        is_mix = false;
+end
+OUT_ALIGN = IN_ALIGN;
+cur_score = CalculateScore(OUT_ALIGN, dist_mat, toint);
+for i = 1:NUM_REPS
+    
+    %determine whether to switch based on the "mix" factor
+    if is_mix
+        is_stoch = ~is_stoch;
+    end
+    
+    
+    rm_rows = GetRows(OUT_ALIGN, is_stoch, RM_FRAC, NUM_TRIES);
+    counter = 1;
+    while counter < NUM_TRIES
+        new_mat = SplitAndReAlign(OUT_ALIGN, rm_rows(:,counter), alpha);
+        
+        new_score = CalculateScore(new_mat, dist_mat, toint);
+        if new_score > cur_score
+            if disp_flag
+                disp_cell = {'On Iter: ', num2str(i), ' Improved by: ', ...
+                    num2str(abs(new_score-cur_score)), ' and by ', ...
+                    num2str(abs(size(new_mat,2)-size(OUT_ALIGN,2))), ...
+                    ' columns', ' Using stoch:', num2str(is_stoch)};
+                display([disp_cell{:}])
+            end
+            cur_score = new_score;
+            OUT_ALIGN = new_mat;
+            break
+        end
+        counter = counter + 1;
+    end
+end
+function rows_mat = GetRows(MAT, STOCH, RM_FRAC, NUM_TRIES)
+%   GetRows
+%       A helper function which determines the rows to split the alignment
+%       with.
+%
+%   MAT         A char-array of the multiple alignment.
+%   STOCH       A boolean indicating whether to use a stochastic approach.
+%   RM_FRAC     The fraction of rows to remove.
+%   NUM_TRIES   The number of tries to return.
+%
+%   rows_mat    A matrix indicating which rows to remove.
+%
+if STOCH
+    rows_mat = rand(size(MAT,1),NUM_TRIES) > RM_FRAC;
+    if any(all(rows_mat,1)) || any(all(~rows_mat,1))
+        cols = any(all(rows_mat,1)) || any(all(~rows_mat,1));
+        rows_mat(1,cols) = ~rows_mat(1,cols);
+    end
+        
+else
+    gap_mask = MAT =='-';
+    runs = FindRuns(gap_mask);
+    score = mode(runs.*(runs~=0))-min(runs);
+    [~, order] = sort(score, 'descend');
+    rm_rows = order(1:NUM_TRIES);
+    [~, norder] = sort(runs(:,rm_rows),'descend');
+    rows_mat = false(size(MAT,1), NUM_TRIES);
+    num_remove = ceil(RM_FRAC*size(MAT,1));
+    for i = 1:NUM_TRIES
+        rows_mat(norder(1:num_remove,i),i) = true;
+    end
+end
+function NEW_MAT = SplitAndReAlign(MAT, ROWS, alpha)
+%   SplitAndReAlign
+%       This will split the alignment matix based on the provided rows and
+%       then create two profiles and re-align them.
+%
+%   MAT         A char-array of the alignment.
+%   ROWS        A boolean-array describing the switch between rows
+%   alpha       Either 'NT' or 'AA' to indicate which sequence profile to
+%               create.
+%
+%   NEW_MAT     The ReAlinged alignment
+%
+mat1 = MAT(ROWS, :);
+mat2 = MAT(~ROWS, :);
+mat1(:,all(mat1 == '-',1)) = [];
+mat2(:,all(mat2 == '-',1)) = [];
+pmat1 = seqprofile(mat1, 'alphabet', alpha);
+pmat2 = seqprofile(mat2, 'alphabet', alpha);
+try
+    [~, ind1, ind2] = profalign(pmat1, pmat2);
+catch
+    ind1 = 1:size(pmat1,2);
+    ind2 = 1:size(pmat2,2);
+end
+    
+NEW_MAT = char(0);
+NEW_MAT(ROWS, ind1) = mat1;
+NEW_MAT(~ROWS, ind2) = mat2;
+NEW_MAT(~isletter(NEW_MAT)) = '-';
+NEW_MAT(all(NEW_MAT == '-',1)) = [];
+function s = CalculateScore(MAT, DIST, FUN)
+%   CalculateScore
+%       A function which calculates the score of the alignment.  This score
+%       is based on the agreement between the consensus and each individual
+%       alignment.
+%
+%   MAT         A char-array of a multiple alignment.
+%   DIST        A distance matrix which indicates the penalty for
+%               mismatches.
+%   FUN         A function which converts letter to numbers such at nt2int
+%               or aa2int.
+%   
+%   s           The score for this alingment.
+%
+%
+cons = seqconsensus(MAT);
+num_cons = FUN(cons);
+num_align = FUN(MAT);
+ncons = repmat(num_cons(:), [size(num_align,1), 1]);
+nalign = num_align(:);
+lookupinds = sub2ind(size(DIST), ncons, nalign);
+s = sum(DIST(lookupinds));
+function reverse_looking = FindRuns(input)
+%   FindRuns
+%       Finds consecutive runs of 1's along the rows of a boolean array.
+%
+%   INPUT = [1 1 1 0 0 0 1 0 1 0 1 1;
+%            0 1 1 1 0 1 1 0 0 1 1 1];
+%   RL =    [1 2 3 0 0 0 1 0 1 0 1 2;
+%           [0 3 2 1 0 1 2 0 0 1 2 3];
+%
+[m,n] = size(input);
+reverse_looking = [zeros(1,m);input.'];
+reverse_looking = reverse_looking(:);
+p = find(~reverse_looking);
+reverse_looking(p) = [0;1-diff(p)];
+reverse_looking = reshape(cumsum(reverse_looking),[],m).';
+reverse_looking(:,1) = [];
diff --git a/test/entropy/don_anal.m b/test/entropy/don_anal.m
new file mode 100644
index 0000000..7d0a648
--- a/dev/null
+++ b/test/entropy/don_anal.m
@@ -0,0 +1,40 @@
+d = fastaread('C:\dondata\FASTA2.fa');
+lens = arrayfun(@(x)(numel(x.Sequence)), d);
+d(lens < 300) = [];
+rexp = '/Human/(\w*)/([\d\w]*)/(.*?)/(\d{4})/';
+headers = cell(length(d), 1);
+[headers{:}] = deal(d(:).Header);
+groups = regexp(headers, rexp, 'tokens', 'once');
+cgroups = cat(1,groups{:});
+dates = unique(cgroups(:,end));
+seqs = cell(length(d), 1);
+[seqs{:}] = deal(d(:).Sequence);
+ALIGN_CELL = cell(length(dates), 1);
+INDS_CELL = cell(length(dates), 1);
+PAIRWISE_CELL = cell(length(dates),1);
+for i = 1:length(dates)
+    display(dates{i})
+    tf = strcmp(cgroups(:,end), dates{i});
+    if isempty(PAIRWISE_CELL{i})
+        pairwise_dists = GenomePairwiseDist(d(tf), dates{i});
+        PAIRWISE_CELL{i} = pairwise_dists;
+    else
+        pairwise_dists = PAIRWISE_CELL{i};
+    end
+    if isempty(ALIGN_CELL{i}) || isempty(INDS_CELL{i})
+        [ALIGN_CELL{i}, INDS_CELL{i}] = GenomeAlignments(d(tf), ...
+                                        pairwise_dists);
+        save restore_data PAIRWISE_CELL ALIGN_CELL INDS_CELL
+    end
+end
+for i = 1:length(ALIGN_CELL)
+    [ALIGN_CELL{i}, ~]= RefineAlignments(ALIGN_CELL{i});
+end
+ents = CalculateProteinEntropy(d(1), ALIGN_CELL);
+dents = cat(1, ents{:});
diff --git a/test/entropy/nwalign_mod.m b/test/entropy/nwalign_mod.m
new file mode 100644
index 0000000..76aade6
--- a/dev/null
+++ b/test/entropy/nwalign_mod.m
@@ -0,0 +1,637 @@
+function [score, alignment, startat, matrices] = nwalign(seq1,seq2,varargin)
+%NWALIGN performs Needleman-Wunsch global alignment of two sequences.
+%
+%   NWALIGN(SEQ1, SEQ2) returns the score (in bits) for the optimal
+%   alignment. Note: The scale factor used to calculate the score is
+%   provided by the scoring matrix info (see below). If this is not
+%   defined, then NWALIGN returns the raw score.
+%
+%   [SCORE, ALIGNMENT] = NWALIGN(SEQ1, SEQ2) returns a string showing an
+%   optimal global alignment of amino acid (or nucleotide) sequences SEQ1
+%   and SEQ2.
+%
+%   [SCORE, ALIGNMENT, STARTAT] = NWALIGN(SEQ1, SEQ2)  returns a 2x1 vector
+%   with the starting point indices indicating the starting point of the
+%   alignment in the two sequences. Note: this output is for consistency
+%   with SWALIGN and will always be [1;1] because this is a global
+%   alignment.
+%
+%   NWALIGN(..., 'ALPHABET', A) specifies whether the sequences are
+%   amino acids ('AA') or nucleotides ('NT'). The default is AA.
+%
+%   NWALIGN(..., 'SCORINGMATRIX', matrix) defines the scoring matrix to be
+%   used for the alignment. The default is BLOSUM50 for AA or NUC44 for NT.
+%
+%   NWALIGN(..., 'SCALE' ,scale) indicates the scale factor of the scoring
+%   matrix to return the score using arbitrary units. If the scoring matrix
+%   Info also provides a scale factor, then both are used.
+%
+%   NWALIGN(..., 'GAPOPEN', penalty) defines the penalty for opening a gap
+%   in the alignment. The default gap open penalty is 8.
+%
+%   NWALIGN(..., 'EXTENDGAP', penalty) defines the penalty for extending a
+%   gap in the alignment. If EXTENDGAP is not specified, then extensions to
+%   gaps are scored with the same value as GAPOPEN.
+%
+%   NWALIGN(..., 'SHOWSCORE', true) displays the scoring space and the
+%   winning path.
+%
+%
+%   Examples:
+%
+%       % Return the score in bits and the global alignment using the
+%       % default scoring matrix (BLOSUM50).
+%       [score, align] = nwalign('VSPAGMASGYD', 'IPGKASYD')
+%
+%       % Use user-specified scoring matrix and "gap open" penalty.
+%       [score, align] = nwalign('IGRHRYHIGG', 'SRYIGRG',...
+%                               'scoringmatrix', @pam250, 'gapopen',5)
+%
+%       % Return the score in nat units (nats).
+%       [score, align] = nwalign('HEAGAWGHEE', 'PAWHEAE', 'scale', log(2))
+%
+%       % Display the scoring space and the winning path.
+%       nwalign('VSPAGMASGYD', 'IPGKASYD', 'showscore', true)
+%
+%   See also BLOSUM, MULTIALIGN, NT2AA, PAM, PROFALIGN, SEQDOTPLOT,
+%   SHOWALIGNMENT, SWALIGN.
+%   References:
+%   R. Durbin, S. Eddy, A. Krogh, and G. Mitchison. Biological Sequence
+%   Analysis. Cambridge UP, 1998.
+%   Needleman, S. B., Wunsch, C. D., J. Mol. Biol. (1970) 48:443-453
+%   Copyright 2002-2006 The MathWorks, Inc.
+%   $Revision: 1.22.6.10 $  $Date: 2006/05/17 20:48:28 $
+gapopen = -8;
+gapextend = -8;
+setGapExtend = false;
+showscore=false;
+isAminoAcid = true;
+scale=1;
+% If the input is a structure then extract the Sequence data.
+if isstruct(seq1)
+    try
+        seq1 = seqfromstruct(seq1);
+    catch
+        rethrow(lasterror);
+    end
+end
+if isstruct(seq2)
+    try
+        seq2 = seqfromstruct(seq2);
+    catch
+        rethrow(lasterror);
+    end
+end
+if nargin > 2
+    if rem(nargin,2) == 1
+        error('Bioinfo:IncorrectNumberOfArguments',...
+            'Incorrect number of arguments to %s.',mfilename);
+    end
+    okargs = {'scoringmatrix','gapopen','extendgap','alphabet','scale','showscore'};
+    for j=1:2:nargin-2
+        pname = varargin{j};
+        pval = varargin{j+1};
+        k = strmatch(lower(pname), okargs); %#ok
+        if isempty(k)
+            error('Bioinfo:UnknownParameterName',...
+                'Unknown parameter name: %s.',pname);
+        elseif length(k)>1
+            error('Bioinfo:AmbiguousParameterName',...
+                'Ambiguous parameter name: %s.',pname);
+        else
+            switch(k)
+                case 1  % scoring matrix
+                    if isnumeric(pval)
+                        ScoringMatrix = pval;
+                    else
+                        if ischar(pval)
+                            pval = lower(pval);
+                        end
+                        try
+                            [ScoringMatrix,ScoringMatrixInfo] = feval(pval);
+                        catch
+                            error('Bioinfo:InvalidScoringMatrix','Invalid scoring matrix.');
+                        end
+                    end
+                case 2 %gap open penalty
+                    gapopen = -pval;
+                case 3 %gap extend penalty
+                    gapextend = -pval;
+                    setGapExtend = true;
+                case 4 %if sequence is nucleotide
+                    if strcmpi(pval,'nt')
+                        isAminoAcid = false;
+                    end
+                case 5 % scale
+                    scale=pval;
+                case 6 % showscore
+                    showscore = pval == true;
+            end
+        end
+    end
+end
+% setting the default scoring matrix
+if ~exist('ScoringMatrix','var')
+    if isAminoAcid
+        [ScoringMatrix,ScoringMatrixInfo] = blosum50;
+    else
+        [ScoringMatrix,ScoringMatrixInfo] = nuc44;
+    end
+end
+% getting the scale from ScoringMatrixInfo, if it exists
+if exist('ScoringMatrixInfo','var') && isfield(ScoringMatrixInfo,'Scale')
+    scale=scale*ScoringMatrixInfo.Scale;
+end
+% handle properly "?" characters typically found in pdb files
+if isAminoAcid
+    if ischar(seq1)
+        seq1 = strrep(seq1,'?','X');
+    else
+        seq1(seq1 == 26) = 23;
+    end
+    if ischar(seq2)
+        seq2 = strrep(seq2,'?','X');
+    else
+        seq2(seq2 == 26) = 23;
+    end
+end
+% check input sequences
+% if isAminoAcid && ~(isaa(seq1) && isaa(seq2))
+%     error('Bioinfo:InvalidAminoAcidSequences',...
+%         'Both sequences must be amino acids, use ALPHABET = ''NT'' for aligning nucleotides.');
+% elseif ~isAminoAcid && ~(isnt(seq1) && isnt(seq2))
+%     error('Bioinfo:InvalidNucleotideSequences',...
+%         'When ALPHABET = ''NT'', both sequences must be nucleotides.');
+% end
+% use numerical arrays for easy indexing
+if ischar(seq1)
+    seq1=upper(seq1); %the output alignment will be all uppercase
+    if isAminoAcid
+        intseq1 = aa2int(seq1);
+    else
+        intseq1 = nt2int(seq1);
+    end
+else
+    intseq1=seq1;
+    if isAminoAcid
+        seq1 = int2aa(intseq1);
+    else
+        seq1 = int2nt(intseq1);
+    end
+end
+if ischar(seq2)
+    seq2=upper(seq2); %the output alignment will be all uppercase
+    if isAminoAcid
+        intseq2 = aa2int(seq2);
+    else
+        intseq2 = nt2int(seq2);
+    end
+else
+    intseq2=seq2;
+    if isAminoAcid
+        seq2 = int2aa(intseq2);
+    else
+        seq2 = int2nt(intseq2);
+    end
+end
+m = length(seq1);
+n = length(seq2);
+if ~n||~m
+    error('Bioinfo:InvalidLengthSequences','Length of input sequences must be greater than 0');
+end
+% If unknown, ambiguous or gaps appear in the sequence, we need to make
+% sure that ScoringMatrix can handle them.
+% possible values are
+% B  Z  X  *  -  ?
+% 21 22 23 24 25 26
+scoringMatrixSize = size(ScoringMatrix,1);
+highestVal = max([intseq1, intseq2]);
+if highestVal > scoringMatrixSize
+    % if the matrix contains the 'Any' we map to that
+    if isAminoAcid
+        anyVal = aa2int('X');
+    else
+        anyVal = nt2int('N');
+    end
+    if scoringMatrixSize >= anyVal
+        intseq1(intseq1>scoringMatrixSize) = anyVal;
+        intseq2(intseq2>scoringMatrixSize) = anyVal;
+    else
+        error('Bioinfo:InvalidSymbolsInInputSequeces',...
+            'Sequences contain symbols that cannot be handled by the given scoring matrix.');
+    end
+end
+if setGapExtend  % call more complicated algorithm if we have
+    [F, pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend);
+else
+%   [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen);
+    scoredMatchMat=[ScoringMatrix(intseq2,intseq1(:)) zeros(n,1); zeros(1,m+1)];
+    [F pointer]=FastNWalign2(scoredMatchMat,gapopen);
+end
+% trace back through the pointer matrix
+halfn=ceil((n+1)/2);
+halfm=ceil((m+1)/2);
+i = n+1; j = m+1;
+path = zeros(n+m,2);
+step = 1;
+score =  max(F(n+1,m+1,:));
+if setGapExtend
+    if F(n+1,m+1,3)==score      % favor with left-gap
+        laststate=3;
+    elseif F(n+1,m+1,2)==score  % then with up-gap
+        laststate=2;
+    else                        % at last with match
+        laststate=1;
+    end
+    while (i>halfn && j>halfm) % in the rigth half favor gaps when several
+        % paths lead to the highest score
+        state=laststate;
+        if bitget(pointer(i,j,state),3)
+            laststate=3;
+        elseif bitget(pointer(i,j,state),2)
+            laststate=2;
+        else
+            laststate=1;
+        end
+        switch state
+            case 1 % is diagonal
+                j = j - 1;
+                i = i - 1;
+                path(step,:) = [j,i];
+            case 2 % is up
+                i = i - 1;
+                path(step,2) = i;
+            case 3 % is left
+                j = j - 1;
+                path(step,1) = j;
+        end
+        step = step +1;
+    end
+    while (i>1 || j>1)    % in the rigth half favor matchs when several
+        % paths lead to the highest score
+        state=laststate;
+        if bitget(pointer(i,j,state),1)
+            laststate=1;
+        elseif bitget(pointer(i,j,state),2)
+            laststate=2;
+        else
+            laststate=3;
+        end
+        switch state
+            case 1 % is diagonal
+                j = j - 1;
+                i = i - 1;
+                path(step,:) = [j,i];
+            case 2 % is up
+                i = i - 1;
+                path(step,2) = i;
+            case 3 % is left
+                j = j - 1;
+                path(step,1) = j;
+        end
+        step = step +1;
+    end
+else % ~setGapExtend
+    while (i > 1 || j > 1)
+        switch pointer(i,j)
+            case 1 % diagonal only
+                j = j - 1;
+                i = i - 1;
+                path(step,:) = [j,i];
+            case 2 % up only
+                i = i - 1;
+                path(step,2) = i;
+            case 4 % left only
+                j = j - 1;
+                path(step,1) = j;
+            case 6 % up or left --> up (favors gaps in seq2)
+                j = j - 1;
+                path(step,1) = j;
+            otherwise %3 diagonal or up   --> diagonal (favors no gaps)
+                %4 diagonal or left       --> diagonal (favors no gaps)
+                %7 diagonal or left or up --> diagonal (favors no gaps)
+                j = j - 1;
+                i = i - 1;
+                path(step,:) = [j,i];
+        end
+        step = step +1;
+    end
+end % if setGapExtend
+% re-scaling the output score
+score = scale * score;
+if nargout<=1 && ~showscore
+    return
+end
+path(step:end,:) = []; % step-1 is the length of the alignment
+path = flipud(path);
+% setting the size of the alignment
+alignment = repmat(('- -')',1,step-1);
+% adding sequence to alignment
+alignment(1,path(:,1)>0) = seq1;
+alignment(3,path(:,2)>0) = seq2;
+% find locations where there are no gaps
+h=find(all(path>0,2));
+if isAminoAcid
+    noGaps1=aa2int(alignment(1,h));
+    noGaps2=aa2int(alignment(3,h));
+else
+    noGaps1=nt2int(alignment(1,h));
+    noGaps2=nt2int(alignment(3,h));
+end
+% erasing symbols that can not be scored
+htodel=max([noGaps1;noGaps2])>scoringMatrixSize;
+h(htodel)=[];
+noGaps1(htodel)=[];
+noGaps2(htodel)=[];
+% score pairs with no gap
+value = ScoringMatrix(sub2ind(size(ScoringMatrix),double(noGaps1),double(noGaps2)));
+% insert symbols of the match string into the alignment
+alignment(2,h(value>=0)) = ':';
+alignment(2,h(noGaps1==noGaps2)) = '|';
+startat = [1;1];
+% undocumented fourth output -- score and pointer matrices
+if nargout > 3
+    matrices.Scores = F;
+    matrices.Pointers = pointer;
+end
+if showscore
+    figure
+    F=scale.*max(F(2:end,2:end,:),[],3);
+    clim=max(max(max(abs(F(~isinf(F))))),eps);
+    imagesc(F,[-clim clim]);
+    colormap(privateColorMap(1));
+    set(colorbar,'YLim',[min([F(:);-eps]) max([F(:);eps])])
+    title('Score for best path')
+    xlabel('Sequence 1')
+    ylabel('Sequence 2')
+    hold on
+    plot(path(all(path>0,2),1),path(all(path>0,2),2),'k.')
+end
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%     function [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gap)
+%         % Standard Needleman-Wunsch algorithm
+% 
+%         %LOOP IMPLEMENTED IN FastNWalign2.c
+%         
+%         % set up storage for dynamic programming matrix
+%         F = zeros(n+1,m+1);
+%         F(2:end,1) = gap * (1:n)';
+%         F(1,2:end) = gap * (1:m);
+% 
+% 
+%         %and for the back tracing matrix
+%         pointer= repmat(uint8(4),n+1,m+1);
+%         pointer(:,1) = 2;  % up
+%         pointer(1,1) = 1;
+% 
+% 
+%         %initialize buffers to the first column
+%         ptr = pointer(:,2); % ptr(1) is always 4
+%         currentFColumn = F(:,1);
+% 
+% 
+%         %main loop runs through the matrix looking for maximal scores
+% 
+%         for outer = 2:m+1
+% 
+%             %score current column
+%             scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1));
+%             %grab the data from the matrices and initialize some values
+%             lastFColumn    = F(:,outer-1);
+%             currentFColumn = F(:,outer);
+%             best = currentFColumn(1);
+% 
+%             %[F(:,outer) pointer(:,outer)]=FastNWalign(lastFColumn,currentFColumn,scoredMatchColumn,gap);
+% 
+%             for inner = 2:n+1
+%                 % score the three options
+%                 up       = best + gap;
+%                 left     = lastFColumn(inner) + gap;
+%                 diagonal = lastFColumn(inner-1) + scoredMatchColumn(inner-1);
+% 
+%                 % max could be used here but it is quicker to use if statements
+%                 if up > left
+%                     best = up;
+%                     pos = 2;
+%                 else
+%                     best = left;
+%                     pos = 4;
+%                 end
+% 
+%                 if diagonal >= best
+%                     best = diagonal;
+%                     ptr(inner) = 1;
+%                 else
+%                     ptr(inner) = pos;
+%                 end
+%                 currentFColumn(inner) = best;
+% 
+%             end % inner
+%             %put back updated columns
+% 
+%             % save columns of pointers
+%             F(:,outer)   = currentFColumn;
+%             % save columns of pointers
+%             pointer(:,outer)  = ptr;
+% 
+%         end % outer
+% 
+%     end
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+    function [F,pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend)
+        % Needleman-Wunsch algorithm modified to handle affine gaps
+        % Set states
+        inAlign =   1;
+        inGapUp =   2;
+        inGapLeft = 3;
+        numStates = 3;
+        % Set up storage for dynamic programming matrix:
+        % for keeping the maximum scores for every state
+        F =  zeros(n+1,m+1,numStates);
+        F(:,1,:) = -inf;
+        F(1,:,:) = -inf;
+        F(1,1,inAlign) = 0;
+        F(2:end,1,inGapUp)   = gapopen + gapextend * (0:n-1)';
+        F(1,2:end,inGapLeft) = gapopen + gapextend * (0:m-1);
+        % and for the back tracing pointers
+        pointer(n+1,m+1,numStates) = uint8(0);
+        pointer(2:end,1,inGapUp)   = 2;  % up
+        pointer(1,2:end,inGapLeft) = 4;  % left
+        % initialize buffers to the first column
+        ptrA = pointer(:,1,inAlign);
+        ptrU = pointer(:,1,inGapLeft);
+        ptrL = pointer(:,1,inGapUp);
+        currentFColumnA = F(:,1,inAlign);
+        currentFColumnU = F(:,1,inGapUp);
+        currentFColumnL = F(:,1,inGapLeft);
+        % main loop runs through the matrix looking for maximal scores
+        for outer = 2:m+1
+            % score current column
+            scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1));
+            % grab the data from the matrices and initialize some values for the
+            % first row the most orderly possible
+            lastFColumnA    = currentFColumnA;
+            currentFColumnA = F(:,outer,inAlign);
+            bestA           = currentFColumnA(1);
+            currentinA      = lastFColumnA(1);
+            lastFColumnU    = currentFColumnU;
+            currentFColumnU = F(:,outer,inGapUp);
+            bestU           = currentFColumnU(1);
+            lastFColumnL    = currentFColumnL;
+            currentFColumnL = F(:,outer,inGapLeft);
+            currentinGL     = lastFColumnL(1);
+            for inner = 2:n+1
+                % grab the data from the columns the most orderly possible
+                upOpen      = bestA + gapopen;
+                inA         = currentinA;
+                currentinA  = lastFColumnA(inner);
+                leftOpen    = currentinA + gapopen;
+                inGL        = currentinGL;
+                currentinGL = lastFColumnL(inner);
+                leftExtend  = currentinGL + gapextend;
+                upExtend = bestU + gapextend;
+                inGU     = lastFColumnU(inner-1);
+                % operate state 'inGapUp'
+                if upOpen > upExtend
+                    bestU = upOpen; ptr = 1;   % diagonal
+                elseif upOpen < upExtend
+                    bestU = upExtend; ptr = 2; % up
+                else % upOpen == upExtend
+                    bestU = upOpen; ptr = 3;   % diagonal and up
+                end
+                currentFColumnU(inner)=bestU;
+                ptrU(inner)=ptr;
+                % operate state 'inGapLeft'
+                if leftOpen > leftExtend
+                    bestL = leftOpen; ptr = 1;   % diagonal
+                elseif leftOpen < leftExtend
+                    bestL = leftExtend; ptr = 4; % left
+                else % leftOpen == leftExtend
+                    bestL = leftOpen; ptr = 5;   % diagonal and left
+                end
+                currentFColumnL(inner) = bestL;
+                ptrL(inner) = ptr;
+                % operate state 'inAlign'
+                if  inA > inGU
+                    if inA > inGL
+                        bestA = inA; ptr = 1;  % diagonal
+                    elseif inGL > inA
+                        bestA = inGL; ptr = 4; % left
+                    else
+                        bestA = inA; ptr = 5;  % diagonal and left
+                    end
+                elseif inGU > inA
+                    if inGU > inGL
+                        bestA = inGU; ptr = 2; % up
+                    elseif inGL > inGU
+                        bestA = inGL; ptr = 4; % left
+                    else
+                        bestA = inGU; ptr = 6; % up & left
+                    end
+                else
+                    if inA > inGL
+                        bestA = inA; ptr = 3;  % diagonal & up
+                    elseif inGL > inA
+                        bestA = inGL; ptr = 4; % left
+                    else
+                        bestA = inA; ptr = 7;  % all
+                    end
+                end
+                bestA = bestA + scoredMatchColumn(inner-1);
+                currentFColumnA(inner) = bestA;
+                ptrA(inner) = ptr;
+            end %inner
+            % put back updated columns
+            F(:,outer,inGapLeft) = currentFColumnL;
+            F(:,outer,inGapUp)   = currentFColumnU;
+            F(:,outer,inAlign)   = currentFColumnA;
+            % save columns of pointers
+            pointer(:,outer,inAlign)  = ptrA;
+            pointer(:,outer,inGapUp)  = ptrU;
+            pointer(:,outer,inGapLeft)= ptrL;
+        end %outer
+    end
+    function pcmap = privateColorMap(selection)
+        %PRIVATECOLORMAP returns a custom color map
+        switch selection
+            case 1, pts = [0 0 .3 20;
+                    0 .1 .8 25;
+                    0 .9 .5 15;
+                    .9 1 .9 8;
+                    1 1 0 26;
+                    1 0 0 26;
+                    .4 0 0 0];
+            otherwise, pts = [0 0 0 128; 1 1 1 0];
+        end
+        xcl=1;
+        for i=1:size(pts,1)-1
+            xcl=[xcl,i+1/pts(i,4):1/pts(i,4):i+1];
+        end
+        pcmap = interp1(pts(:,1:3),xcl);
+    end
+end
author	Don Pellegrino <don@drexel.edu>	2010-01-08 17:20:16 (GMT)
committer	Don Pellegrino <don@drexel.edu>	2010-01-08 17:20:16 (GMT)
commit	170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf (patch) (unidiff)
tree	bcfea699462d48906f11dfc5fb4d36725bc12710
parent	15b4e1318130d4c49109746efecc1018dce54da9 (diff)
download	exp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.zip exp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.tar.gz exp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.tar.bz2