From 170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf Mon Sep 17 00:00:00 2001 From: Don Pellegrino Date: Fri, 08 Jan 2010 17:20:16 +0000 Subject: Will Dampier's Matlab code for analyzing the entropy of the collection by sequence position and time. --- diff --git a/test/entropy/Align2Ref.m b/test/entropy/Align2Ref.m new file mode 100644 index 0000000..3021d4a --- a/dev/null +++ b/test/entropy/Align2Ref.m @@ -0,0 +1,24 @@ +function NEW_INDS = Align2Ref(REF_SEQ, ALIGN, ALIGN_INDS, ALPHA) +% Align2Ref +% Uses a profile alignment to change a set of indicices in the +% "alignment space" to the "reference space". +% +% +% NEW_INDS = Align2Ref(REF_SEQ, ALIGN, ALIGN_INDS) +% + +if nargin == 3 + ALPHA = 'aa'; +end + +ref_prof = seqprofile(REF_SEQ, 'alphabet', ALPHA); +align_prof = seqprofile(ALIGN, 'alphabet', ALPHA); + +[~, ind1, ind2] = profalign(ref_prof, align_prof); + +ninds = ind2(ALIGN_INDS); + +mask = sum(bsxfun(@le, ind1(:)', ninds(:)),2); +mask(mask == 0) = 1; + +NEW_INDS = reshape(ind1(mask), size(ALIGN_INDS)); \ No newline at end of file diff --git a/test/entropy/CalculateEntropy.m b/test/entropy/CalculateEntropy.m new file mode 100644 index 0000000..18bfda5 --- a/dev/null +++ b/test/entropy/CalculateEntropy.m @@ -0,0 +1,15 @@ +function OUT_VEC = CalculateEntropy(ARRAY) +% CalculateEntropy +% Calculate the Informationational Entropy of each position in a +% multialignment. In this function I remove all gap characters from +% each column of the alignment before doing the calculation. +% +% +% + +num_array = aa2int(ARRAY); +bins = histc(num_array, 0:21, 1); + +probs=bsxfun(@rdivide, bins(2:end-1,:),sum(bins(2:end-1,:))); + +OUT_VEC = -nansum(log(probs).*double(probs~=0).*probs); \ No newline at end of file diff --git a/test/entropy/CalculateProteinEntropy.m b/test/entropy/CalculateProteinEntropy.m new file mode 100644 index 0000000..41f06f5 --- a/dev/null +++ b/test/entropy/CalculateProteinEntropy.m @@ -0,0 +1,25 @@ +function OUT_DATA=CalculateProteinEntropy(Ref, ALIGN_CELL) + +OUT_DATA = cell(size(ALIGN_CELL)); +for i = 1:size(ALIGN_CELL,1) + + aligned_seqs = ALIGN_CELL{i}; + + ref_seq = Ref.Sequence; + + ref_prof = seqprofile(ref_seq, 'alphabet', 'aa'); + seqs_prof = seqprofile(aligned_seqs, 'alphabet', 'aa'); + + [mp, H1, H2] = profalign(ref_prof, seqs_prof); + + merged_seqs = char(zeros(size(aligned_seqs,1)+1, size(mp,2))); + merged_seqs(2:end, H2) = aligned_seqs; + merged_seqs(1, H1) = ref_seq; + + entropy_vals = CalculateEntropy(merged_seqs); + + %only pull out the values that are in the reference positions + ref_entropy = entropy_vals(H1); + + OUT_DATA{i} = ref_entropy; +end diff --git a/test/entropy/FastNWalign2.c b/test/entropy/FastNWalign2.c new file mode 100644 index 0000000..33b286d --- a/dev/null +++ b/test/entropy/FastNWalign2.c @@ -0,0 +1,94 @@ +#include "mex.h" +#include "matrix.h" + +void simplegap(double scoredMatchMat[], +const double gap, int n, int m, double output_F[], double output_P[]) +{ + // Standard Needleman-Wunsch algorithm + + double up, left, diagonal, best, pos; + + + int i,j; + + + for(i=0;ileft) + { + best=up; + pos=2; + } + else + { + best=left; + pos=4; + } + + if (diagonal>=best) + { + best=diagonal; + output_P[i+j*m]=1; + } + else + { + output_P[i+j*m]=pos; + } + output_F[i+j*m]=best; + } + } +} + +void mexFunction(int nlhs, mxArray *plhs[], +int nrhs, const mxArray *prhs[]) +{ + + double *gap, *output_F, *output_P; + double *scoredMatchMat; + int n, m; + mwSize i; + + //double *F_col, *ptr_col; + m=mxGetM(prhs[0]); + n=mxGetN(prhs[0]); + gap=mxGetPr(prhs[1]); + + plhs[0]=mxCreateDoubleMatrix(m,n,mxREAL); + plhs[1]=mxCreateDoubleMatrix(m,n,mxREAL); + + output_F=mxGetPr(plhs[0]); + output_P=mxGetPr(plhs[1]); + + + scoredMatchMat=mxGetPr(prhs[0]); + // + + + simplegap(scoredMatchMat,*gap,n,m,output_F,output_P); + +} + diff --git a/test/entropy/GenomeAlignments.m b/test/entropy/GenomeAlignments.m new file mode 100644 index 0000000..be56570 --- a/dev/null +++ b/test/entropy/GenomeAlignments.m @@ -0,0 +1,31 @@ +function [ALIGN inds] = GenomeAlignments(IN_GENOMES, DIST_MAT) +% GenomeAlignments +% This will align all of the protein segments in the genome structure +% and return a cell-array. Each cell is the alignment of one +% protein. If the protein is missing then the distance is NaN. +% + +[tree inds] = MakeTree(DIST_MAT); +ALIGN = malign(tree, inds); + + function align=malign(tree, inds) + seqs = arrayfun(@(x)(x.Sequence), IN_GENOMES(inds), 'uniformoutput', false); + align = multialign(seqs, tree); + end + + + + function [tree_obj inds]=MakeTree(dist_mat) + % MakeTree + % Takes a distance matrix and returns a tree object and the indicies + % to the rows that are in the tree. + % + + inds = find(~all(isnan(dist_mat)|isinf(dist_mat)|dist_mat==0)); + dvec = squareform(dist_mat(inds, inds)); + dvec(dvec <= 0) = rand(nnz(dvec <= 0),1)*min(dvec(dvec>0)); + tree_obj = seqlinkage(dvec); + + end + +end \ No newline at end of file diff --git a/test/entropy/GenomePairwiseDist.m b/test/entropy/GenomePairwiseDist.m new file mode 100644 index 0000000..544db0a --- a/dev/null +++ b/test/entropy/GenomePairwiseDist.m @@ -0,0 +1,98 @@ +function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME) +% GenomePairwiseDist +% This will align all of the protein segments in the genome structure +% and return a cell-array. Each cell is the alignment of one +% protein. If the protein is missing then the distance is NaN. +% + +if nargin < 1 + ALIGN_NAME = 'saved_alignments'; +end + +try + display('Loading Previous Alignments') + d = load(ALIGN_NAME); + saved_dists = d.DIST_CELL; + saved_headers = d.header_names; +catch %#ok + display('No previous alignments found') + saved_headers = {}; + saved_dists = zeros(numel(IN_PROTS)); +end + +NUM_LETTERS = 4000000; + + +[header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header); + +display('Starting Alignments') + +dist_mat = NaN(size(IN_PROTS,1)); + +[tf loc]=ismember(header_names, saved_headers); + +dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf)); +[I J] = find(triu(isnan(dist_mat))); +inds = sub2ind(size(dist_mat), I, J); + +sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS); + +%find all of the ones that are not worth aligning +bad_mask = sizes(I) <= 1 | sizes(J) <= 1; +dist_mat(inds(bad_mask)) = Inf; +I(bad_mask) = []; +J(bad_mask) = []; + +p_sizes = sizes(I)+sizes(J); +counter = 1; +display('Aligning...') +tic +while counter < length(I) + cum_sizes = cumsum(p_sizes(counter+1:end)); + top = counter+find(cum_sizes + dist(k) = Inf; + end + end + time = toc; + inds = sub2ind(size(dist_mat), I_spots,J_spots); + dist_mat(inds) = dist; + counter = top+1; + display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds']) +end + +DIST_CELL = dist_mat; +save(ALIGN_NAME, 'DIST_CELL', 'header_names') +DIST_OUT = dfun(DIST_CELL); + + + + + +function out_d = dfun(MAT) + +score12 = triu(MAT,1); +self_d = diag(MAT); +d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)')); + +out_d = triu(d,1) + triu(d,1)'; + + + + + + + diff --git a/test/entropy/RefineAlignments.m b/test/entropy/RefineAlignments.m new file mode 100644 index 0000000..0e69275 --- a/dev/null +++ b/test/entropy/RefineAlignments.m @@ -0,0 +1,276 @@ +function [OUT_ALIGN cur_score] = RefineAlignments(IN_ALIGN, varargin) +% RefineAlignments +% Uses an interative algorithm to improve the multiple alignments of +% an input sequence. It will iteratively remove a fraction of the +% sequences, compress any gaps, and then re-align. This will be done +% for NUM_REPS. +% +% Written by Will Dampier. Contact at wnd22@drexel.edu +% Version 1.0 on 10/23/09 +% +% [OUT_ALIGN SCORE] = RefineAlignments(IN_ALIGN) +% +% IN_ALIGN A char-array representing a multiple alignment to be +% refined. +% +% OUT_ALIGN The refined alignment. +% SCORE The new alignment score. +% +% +% Optional Arguements: +% +% 'Alphabet' Either 'AA' or 'NT' to indicate whether this is a +% nucleotide or amino acid alignment. +% +% 'GapPenalty' The penalty for gaps. This is set at -8 by +% default. +% +% 'DistMat' The distance matrix to use for the alignment +% scoring. Default is BLOSUM50 for 'AA' and NUC44 +% for 'NT' +% +% 'NumReps' The number of refining iterations to do. +% +% 'NumTry' The number of tries to perform at each iteration. +% +% 'RefineType' Which type of refinement to perform. This can +% be either 'STOCASTIC', 'DETERMINISTIC' or 'MIXED'. +% +% 'Display' A boolean indicating whether to display the output +% at each iteration. +% +% + + + + +RM_FRAC = 0.1; +NUM_TRIES = 10; +NUM_REPS = 100; +alpha = 'aa'; +dist_mat = []; +gap_penalty = -8; +TYPE = 'mixed'; +disp_flag = true; + +for i = 1:2:length(varargin) + switch lower(varargin{i}) + + case 'alphabet' + if strcmpi('nt', varargin{i+1}) || strcmpi('aa', varargin{i+1}) + alpha = varargin{i+1}; + else + error('RefineAlignments:BADALPHA', ... + 'Arguement to "alphabet" must be "NT" or "AA"') + end + case 'gappenalty' + if isnumeric(varargin{i+1}) && varargin{i+1} < 0 + gap_penalty = varargin{i+1}; + else + error('RefineAlignments:BADGAP', ... + 'Arguement to "GapPenalty" must be negative numeric') + end + case 'distmat' + dist_mat = varargin{i+1}; + + case 'numreps' + if isnumeric(varargin{i+1}) && varargin{i+1} > 1 + NUM_REPS = varargin{i+1}; + else + error('RefineAlignments:BADREPS', ... + 'Arguement to NumReps must be a positive numeric') + end + + case 'refinetype' + TYPE = varargin{i+1}; + + case 'display' + if islogical(varargin{i+1}) + disp_flag = varargin{i+1}; + else + error('RefineAlignments:DISPLAY', ... + 'Arguement to Display must be a boolean.') + end + + otherwise + error('RefineAlignments:BADARG', 'Unknown arguement: %s', ... + lower(varargin{i})) + + end +end + +if isempty(dist_mat) + if strcmpi(alpha, 'nt') + dist_mat = nuc44; + toint = @nt2int; + else + dist_mat = blosum50; + toint = @aa2int; + end +end + +dist_mat(end+1,:) = gap_penalty; +dist_mat(:,end+1) = gap_penalty; +dist_mat(end,end) = 0; + +switch upper(TYPE) + case 'MIXED' + is_stoch = true; + is_mix = true; + case 'STOCASTIC' + is_stoch = true; + is_mix = false; + case 'DETERMINISTIC' + is_stoch = false; + is_mix = false; +end + + +OUT_ALIGN = IN_ALIGN; +cur_score = CalculateScore(OUT_ALIGN, dist_mat, toint); + +for i = 1:NUM_REPS + + %determine whether to switch based on the "mix" factor + if is_mix + is_stoch = ~is_stoch; + end + + + rm_rows = GetRows(OUT_ALIGN, is_stoch, RM_FRAC, NUM_TRIES); + counter = 1; + while counter < NUM_TRIES + new_mat = SplitAndReAlign(OUT_ALIGN, rm_rows(:,counter), alpha); + + new_score = CalculateScore(new_mat, dist_mat, toint); + if new_score > cur_score + if disp_flag + disp_cell = {'On Iter: ', num2str(i), ' Improved by: ', ... + num2str(abs(new_score-cur_score)), ' and by ', ... + num2str(abs(size(new_mat,2)-size(OUT_ALIGN,2))), ... + ' columns', ' Using stoch:', num2str(is_stoch)}; + display([disp_cell{:}]) + end + cur_score = new_score; + OUT_ALIGN = new_mat; + break + end + counter = counter + 1; + end +end + +function rows_mat = GetRows(MAT, STOCH, RM_FRAC, NUM_TRIES) +% GetRows +% A helper function which determines the rows to split the alignment +% with. +% +% MAT A char-array of the multiple alignment. +% STOCH A boolean indicating whether to use a stochastic approach. +% RM_FRAC The fraction of rows to remove. +% NUM_TRIES The number of tries to return. +% +% rows_mat A matrix indicating which rows to remove. +% + +if STOCH + rows_mat = rand(size(MAT,1),NUM_TRIES) > RM_FRAC; + if any(all(rows_mat,1)) || any(all(~rows_mat,1)) + cols = any(all(rows_mat,1)) || any(all(~rows_mat,1)); + rows_mat(1,cols) = ~rows_mat(1,cols); + end + +else + gap_mask = MAT =='-'; + runs = FindRuns(gap_mask); + score = mode(runs.*(runs~=0))-min(runs); + [~, order] = sort(score, 'descend'); + rm_rows = order(1:NUM_TRIES); + [~, norder] = sort(runs(:,rm_rows),'descend'); + rows_mat = false(size(MAT,1), NUM_TRIES); + num_remove = ceil(RM_FRAC*size(MAT,1)); + for i = 1:NUM_TRIES + rows_mat(norder(1:num_remove,i),i) = true; + end +end + +function NEW_MAT = SplitAndReAlign(MAT, ROWS, alpha) +% SplitAndReAlign +% This will split the alignment matix based on the provided rows and +% then create two profiles and re-align them. +% +% MAT A char-array of the alignment. +% ROWS A boolean-array describing the switch between rows +% alpha Either 'NT' or 'AA' to indicate which sequence profile to +% create. +% +% NEW_MAT The ReAlinged alignment +% + +mat1 = MAT(ROWS, :); +mat2 = MAT(~ROWS, :); + +mat1(:,all(mat1 == '-',1)) = []; +mat2(:,all(mat2 == '-',1)) = []; + +pmat1 = seqprofile(mat1, 'alphabet', alpha); +pmat2 = seqprofile(mat2, 'alphabet', alpha); +try + [~, ind1, ind2] = profalign(pmat1, pmat2); +catch + ind1 = 1:size(pmat1,2); + ind2 = 1:size(pmat2,2); +end + +NEW_MAT = char(0); +NEW_MAT(ROWS, ind1) = mat1; +NEW_MAT(~ROWS, ind2) = mat2; +NEW_MAT(~isletter(NEW_MAT)) = '-'; +NEW_MAT(all(NEW_MAT == '-',1)) = []; + + + + +function s = CalculateScore(MAT, DIST, FUN) +% CalculateScore +% A function which calculates the score of the alignment. This score +% is based on the agreement between the consensus and each individual +% alignment. +% +% MAT A char-array of a multiple alignment. +% DIST A distance matrix which indicates the penalty for +% mismatches. +% FUN A function which converts letter to numbers such at nt2int +% or aa2int. +% +% s The score for this alingment. +% +% + +cons = seqconsensus(MAT); +num_cons = FUN(cons); +num_align = FUN(MAT); +ncons = repmat(num_cons(:), [size(num_align,1), 1]); +nalign = num_align(:); +lookupinds = sub2ind(size(DIST), ncons, nalign); +s = sum(DIST(lookupinds)); + +function reverse_looking = FindRuns(input) +% FindRuns +% Finds consecutive runs of 1's along the rows of a boolean array. +% +% INPUT = [1 1 1 0 0 0 1 0 1 0 1 1; +% 0 1 1 1 0 1 1 0 0 1 1 1]; +% RL = [1 2 3 0 0 0 1 0 1 0 1 2; +% [0 3 2 1 0 1 2 0 0 1 2 3]; +% + +[m,n] = size(input); +reverse_looking = [zeros(1,m);input.']; +reverse_looking = reverse_looking(:); +p = find(~reverse_looking); +reverse_looking(p) = [0;1-diff(p)]; +reverse_looking = reshape(cumsum(reverse_looking),[],m).'; +reverse_looking(:,1) = []; + + + diff --git a/test/entropy/don_anal.m b/test/entropy/don_anal.m new file mode 100644 index 0000000..7d0a648 --- a/dev/null +++ b/test/entropy/don_anal.m @@ -0,0 +1,40 @@ +d = fastaread('C:\dondata\FASTA2.fa'); +lens = arrayfun(@(x)(numel(x.Sequence)), d); +d(lens < 300) = []; +rexp = '/Human/(\w*)/([\d\w]*)/(.*?)/(\d{4})/'; +headers = cell(length(d), 1); +[headers{:}] = deal(d(:).Header); +groups = regexp(headers, rexp, 'tokens', 'once'); +cgroups = cat(1,groups{:}); +dates = unique(cgroups(:,end)); + +seqs = cell(length(d), 1); +[seqs{:}] = deal(d(:).Sequence); + +ALIGN_CELL = cell(length(dates), 1); +INDS_CELL = cell(length(dates), 1); +PAIRWISE_CELL = cell(length(dates),1); +for i = 1:length(dates) + display(dates{i}) + tf = strcmp(cgroups(:,end), dates{i}); + if isempty(PAIRWISE_CELL{i}) + pairwise_dists = GenomePairwiseDist(d(tf), dates{i}); + PAIRWISE_CELL{i} = pairwise_dists; + else + pairwise_dists = PAIRWISE_CELL{i}; + end + if isempty(ALIGN_CELL{i}) || isempty(INDS_CELL{i}) + [ALIGN_CELL{i}, INDS_CELL{i}] = GenomeAlignments(d(tf), ... + pairwise_dists); + save restore_data PAIRWISE_CELL ALIGN_CELL INDS_CELL + end +end + +for i = 1:length(ALIGN_CELL) + [ALIGN_CELL{i}, ~]= RefineAlignments(ALIGN_CELL{i}); +end + + +ents = CalculateProteinEntropy(d(1), ALIGN_CELL); + +dents = cat(1, ents{:}); diff --git a/test/entropy/nwalign_mod.m b/test/entropy/nwalign_mod.m new file mode 100644 index 0000000..76aade6 --- a/dev/null +++ b/test/entropy/nwalign_mod.m @@ -0,0 +1,637 @@ +function [score, alignment, startat, matrices] = nwalign(seq1,seq2,varargin) +%NWALIGN performs Needleman-Wunsch global alignment of two sequences. +% +% NWALIGN(SEQ1, SEQ2) returns the score (in bits) for the optimal +% alignment. Note: The scale factor used to calculate the score is +% provided by the scoring matrix info (see below). If this is not +% defined, then NWALIGN returns the raw score. +% +% [SCORE, ALIGNMENT] = NWALIGN(SEQ1, SEQ2) returns a string showing an +% optimal global alignment of amino acid (or nucleotide) sequences SEQ1 +% and SEQ2. +% +% [SCORE, ALIGNMENT, STARTAT] = NWALIGN(SEQ1, SEQ2) returns a 2x1 vector +% with the starting point indices indicating the starting point of the +% alignment in the two sequences. Note: this output is for consistency +% with SWALIGN and will always be [1;1] because this is a global +% alignment. +% +% NWALIGN(..., 'ALPHABET', A) specifies whether the sequences are +% amino acids ('AA') or nucleotides ('NT'). The default is AA. +% +% NWALIGN(..., 'SCORINGMATRIX', matrix) defines the scoring matrix to be +% used for the alignment. The default is BLOSUM50 for AA or NUC44 for NT. +% +% NWALIGN(..., 'SCALE' ,scale) indicates the scale factor of the scoring +% matrix to return the score using arbitrary units. If the scoring matrix +% Info also provides a scale factor, then both are used. +% +% NWALIGN(..., 'GAPOPEN', penalty) defines the penalty for opening a gap +% in the alignment. The default gap open penalty is 8. +% +% NWALIGN(..., 'EXTENDGAP', penalty) defines the penalty for extending a +% gap in the alignment. If EXTENDGAP is not specified, then extensions to +% gaps are scored with the same value as GAPOPEN. +% +% NWALIGN(..., 'SHOWSCORE', true) displays the scoring space and the +% winning path. +% +% +% Examples: +% +% % Return the score in bits and the global alignment using the +% % default scoring matrix (BLOSUM50). +% [score, align] = nwalign('VSPAGMASGYD', 'IPGKASYD') +% +% % Use user-specified scoring matrix and "gap open" penalty. +% [score, align] = nwalign('IGRHRYHIGG', 'SRYIGRG',... +% 'scoringmatrix', @pam250, 'gapopen',5) +% +% % Return the score in nat units (nats). +% [score, align] = nwalign('HEAGAWGHEE', 'PAWHEAE', 'scale', log(2)) +% +% % Display the scoring space and the winning path. +% nwalign('VSPAGMASGYD', 'IPGKASYD', 'showscore', true) +% +% See also BLOSUM, MULTIALIGN, NT2AA, PAM, PROFALIGN, SEQDOTPLOT, +% SHOWALIGNMENT, SWALIGN. + +% References: +% R. Durbin, S. Eddy, A. Krogh, and G. Mitchison. Biological Sequence +% Analysis. Cambridge UP, 1998. +% Needleman, S. B., Wunsch, C. D., J. Mol. Biol. (1970) 48:443-453 + +% Copyright 2002-2006 The MathWorks, Inc. +% $Revision: 1.22.6.10 $ $Date: 2006/05/17 20:48:28 $ + +gapopen = -8; +gapextend = -8; +setGapExtend = false; +showscore=false; +isAminoAcid = true; +scale=1; + +% If the input is a structure then extract the Sequence data. +if isstruct(seq1) + try + seq1 = seqfromstruct(seq1); + catch + rethrow(lasterror); + end +end +if isstruct(seq2) + try + seq2 = seqfromstruct(seq2); + catch + rethrow(lasterror); + end +end +if nargin > 2 + if rem(nargin,2) == 1 + error('Bioinfo:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); + end + okargs = {'scoringmatrix','gapopen','extendgap','alphabet','scale','showscore'}; + for j=1:2:nargin-2 + pname = varargin{j}; + pval = varargin{j+1}; + k = strmatch(lower(pname), okargs); %#ok + if isempty(k) + error('Bioinfo:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('Bioinfo:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + switch(k) + case 1 % scoring matrix + if isnumeric(pval) + ScoringMatrix = pval; + else + if ischar(pval) + pval = lower(pval); + end + try + [ScoringMatrix,ScoringMatrixInfo] = feval(pval); + catch + error('Bioinfo:InvalidScoringMatrix','Invalid scoring matrix.'); + end + end + case 2 %gap open penalty + gapopen = -pval; + case 3 %gap extend penalty + gapextend = -pval; + setGapExtend = true; + case 4 %if sequence is nucleotide + if strcmpi(pval,'nt') + isAminoAcid = false; + end + case 5 % scale + scale=pval; + case 6 % showscore + showscore = pval == true; + end + end + end +end + +% setting the default scoring matrix +if ~exist('ScoringMatrix','var') + if isAminoAcid + [ScoringMatrix,ScoringMatrixInfo] = blosum50; + else + [ScoringMatrix,ScoringMatrixInfo] = nuc44; + end +end + + +% getting the scale from ScoringMatrixInfo, if it exists +if exist('ScoringMatrixInfo','var') && isfield(ScoringMatrixInfo,'Scale') + scale=scale*ScoringMatrixInfo.Scale; +end + +% handle properly "?" characters typically found in pdb files +if isAminoAcid + if ischar(seq1) + seq1 = strrep(seq1,'?','X'); + else + seq1(seq1 == 26) = 23; + end + if ischar(seq2) + seq2 = strrep(seq2,'?','X'); + else + seq2(seq2 == 26) = 23; + end +end + +% check input sequences +% if isAminoAcid && ~(isaa(seq1) && isaa(seq2)) +% error('Bioinfo:InvalidAminoAcidSequences',... +% 'Both sequences must be amino acids, use ALPHABET = ''NT'' for aligning nucleotides.'); +% elseif ~isAminoAcid && ~(isnt(seq1) && isnt(seq2)) +% error('Bioinfo:InvalidNucleotideSequences',... +% 'When ALPHABET = ''NT'', both sequences must be nucleotides.'); +% end + +% use numerical arrays for easy indexing +if ischar(seq1) + seq1=upper(seq1); %the output alignment will be all uppercase + if isAminoAcid + intseq1 = aa2int(seq1); + else + intseq1 = nt2int(seq1); + end +else + intseq1=seq1; + if isAminoAcid + seq1 = int2aa(intseq1); + else + seq1 = int2nt(intseq1); + end +end +if ischar(seq2) + seq2=upper(seq2); %the output alignment will be all uppercase + if isAminoAcid + intseq2 = aa2int(seq2); + else + intseq2 = nt2int(seq2); + end +else + intseq2=seq2; + if isAminoAcid + seq2 = int2aa(intseq2); + else + seq2 = int2nt(intseq2); + end +end + + +m = length(seq1); +n = length(seq2); +if ~n||~m + error('Bioinfo:InvalidLengthSequences','Length of input sequences must be greater than 0'); +end + +% If unknown, ambiguous or gaps appear in the sequence, we need to make +% sure that ScoringMatrix can handle them. + +% possible values are +% B Z X * - ? +% 21 22 23 24 25 26 + +scoringMatrixSize = size(ScoringMatrix,1); + +highestVal = max([intseq1, intseq2]); +if highestVal > scoringMatrixSize + % if the matrix contains the 'Any' we map to that + if isAminoAcid + anyVal = aa2int('X'); + else + anyVal = nt2int('N'); + end + if scoringMatrixSize >= anyVal + intseq1(intseq1>scoringMatrixSize) = anyVal; + intseq2(intseq2>scoringMatrixSize) = anyVal; + else + error('Bioinfo:InvalidSymbolsInInputSequeces',... + 'Sequences contain symbols that cannot be handled by the given scoring matrix.'); + end +end + +if setGapExtend % call more complicated algorithm if we have + [F, pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend); +else +% [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen); + scoredMatchMat=[ScoringMatrix(intseq2,intseq1(:)) zeros(n,1); zeros(1,m+1)]; + [F pointer]=FastNWalign2(scoredMatchMat,gapopen); +end + +% trace back through the pointer matrix +halfn=ceil((n+1)/2); +halfm=ceil((m+1)/2); + +i = n+1; j = m+1; +path = zeros(n+m,2); +step = 1; + +score = max(F(n+1,m+1,:)); + +if setGapExtend + + if F(n+1,m+1,3)==score % favor with left-gap + laststate=3; + elseif F(n+1,m+1,2)==score % then with up-gap + laststate=2; + else % at last with match + laststate=1; + end + + while (i>halfn && j>halfm) % in the rigth half favor gaps when several + % paths lead to the highest score + state=laststate; + if bitget(pointer(i,j,state),3) + laststate=3; + elseif bitget(pointer(i,j,state),2) + laststate=2; + else + laststate=1; + end + + switch state + case 1 % is diagonal + j = j - 1; + i = i - 1; + path(step,:) = [j,i]; + case 2 % is up + i = i - 1; + path(step,2) = i; + case 3 % is left + j = j - 1; + path(step,1) = j; + end + step = step +1; + end + + while (i>1 || j>1) % in the rigth half favor matchs when several + % paths lead to the highest score + state=laststate; + if bitget(pointer(i,j,state),1) + laststate=1; + elseif bitget(pointer(i,j,state),2) + laststate=2; + else + laststate=3; + end + + switch state + case 1 % is diagonal + j = j - 1; + i = i - 1; + path(step,:) = [j,i]; + case 2 % is up + i = i - 1; + path(step,2) = i; + case 3 % is left + j = j - 1; + path(step,1) = j; + end + step = step +1; + end + +else % ~setGapExtend + + while (i > 1 || j > 1) + + switch pointer(i,j) + case 1 % diagonal only + j = j - 1; + i = i - 1; + path(step,:) = [j,i]; + case 2 % up only + i = i - 1; + path(step,2) = i; + case 4 % left only + j = j - 1; + path(step,1) = j; + case 6 % up or left --> up (favors gaps in seq2) + j = j - 1; + path(step,1) = j; + otherwise %3 diagonal or up --> diagonal (favors no gaps) + %4 diagonal or left --> diagonal (favors no gaps) + %7 diagonal or left or up --> diagonal (favors no gaps) + j = j - 1; + i = i - 1; + path(step,:) = [j,i]; + end + step = step +1; + end + +end % if setGapExtend + +% re-scaling the output score +score = scale * score; + +if nargout<=1 && ~showscore + return +end + +path(step:end,:) = []; % step-1 is the length of the alignment +path = flipud(path); + +% setting the size of the alignment +alignment = repmat(('- -')',1,step-1); + +% adding sequence to alignment +alignment(1,path(:,1)>0) = seq1; +alignment(3,path(:,2)>0) = seq2; + +% find locations where there are no gaps +h=find(all(path>0,2)); +if isAminoAcid + noGaps1=aa2int(alignment(1,h)); + noGaps2=aa2int(alignment(3,h)); +else + noGaps1=nt2int(alignment(1,h)); + noGaps2=nt2int(alignment(3,h)); +end + +% erasing symbols that can not be scored +htodel=max([noGaps1;noGaps2])>scoringMatrixSize; +h(htodel)=[]; +noGaps1(htodel)=[]; +noGaps2(htodel)=[]; +% score pairs with no gap +value = ScoringMatrix(sub2ind(size(ScoringMatrix),double(noGaps1),double(noGaps2))); + +% insert symbols of the match string into the alignment +alignment(2,h(value>=0)) = ':'; +alignment(2,h(noGaps1==noGaps2)) = '|'; + +startat = [1;1]; + +% undocumented fourth output -- score and pointer matrices +if nargout > 3 + matrices.Scores = F; + matrices.Pointers = pointer; +end + +if showscore + figure + F=scale.*max(F(2:end,2:end,:),[],3); + clim=max(max(max(abs(F(~isinf(F))))),eps); + imagesc(F,[-clim clim]); + colormap(privateColorMap(1)); + set(colorbar,'YLim',[min([F(:);-eps]) max([F(:);eps])]) + title('Score for best path') + xlabel('Sequence 1') + ylabel('Sequence 2') + hold on + plot(path(all(path>0,2),1),path(all(path>0,2),2),'k.') +end +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% function [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gap) +% % Standard Needleman-Wunsch algorithm +% +% %LOOP IMPLEMENTED IN FastNWalign2.c +% +% % set up storage for dynamic programming matrix +% F = zeros(n+1,m+1); +% F(2:end,1) = gap * (1:n)'; +% F(1,2:end) = gap * (1:m); +% +% +% %and for the back tracing matrix +% pointer= repmat(uint8(4),n+1,m+1); +% pointer(:,1) = 2; % up +% pointer(1,1) = 1; +% +% +% %initialize buffers to the first column +% ptr = pointer(:,2); % ptr(1) is always 4 +% currentFColumn = F(:,1); +% +% +% %main loop runs through the matrix looking for maximal scores +% +% for outer = 2:m+1 +% +% %score current column +% scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1)); +% %grab the data from the matrices and initialize some values +% lastFColumn = F(:,outer-1); +% currentFColumn = F(:,outer); +% best = currentFColumn(1); +% +% %[F(:,outer) pointer(:,outer)]=FastNWalign(lastFColumn,currentFColumn,scoredMatchColumn,gap); +% +% for inner = 2:n+1 +% % score the three options +% up = best + gap; +% left = lastFColumn(inner) + gap; +% diagonal = lastFColumn(inner-1) + scoredMatchColumn(inner-1); +% +% % max could be used here but it is quicker to use if statements +% if up > left +% best = up; +% pos = 2; +% else +% best = left; +% pos = 4; +% end +% +% if diagonal >= best +% best = diagonal; +% ptr(inner) = 1; +% else +% ptr(inner) = pos; +% end +% currentFColumn(inner) = best; +% +% end % inner +% %put back updated columns +% +% % save columns of pointers +% F(:,outer) = currentFColumn; +% % save columns of pointers +% pointer(:,outer) = ptr; +% +% end % outer +% +% end +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + function [F,pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend) + % Needleman-Wunsch algorithm modified to handle affine gaps + + % Set states + inAlign = 1; + inGapUp = 2; + inGapLeft = 3; + numStates = 3; + + % Set up storage for dynamic programming matrix: + % for keeping the maximum scores for every state + + F = zeros(n+1,m+1,numStates); + F(:,1,:) = -inf; + F(1,:,:) = -inf; + F(1,1,inAlign) = 0; + + F(2:end,1,inGapUp) = gapopen + gapextend * (0:n-1)'; + F(1,2:end,inGapLeft) = gapopen + gapextend * (0:m-1); + + % and for the back tracing pointers + pointer(n+1,m+1,numStates) = uint8(0); + pointer(2:end,1,inGapUp) = 2; % up + pointer(1,2:end,inGapLeft) = 4; % left + + % initialize buffers to the first column + ptrA = pointer(:,1,inAlign); + ptrU = pointer(:,1,inGapLeft); + ptrL = pointer(:,1,inGapUp); + + currentFColumnA = F(:,1,inAlign); + currentFColumnU = F(:,1,inGapUp); + currentFColumnL = F(:,1,inGapLeft); + + % main loop runs through the matrix looking for maximal scores + for outer = 2:m+1 + % score current column + scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1)); + % grab the data from the matrices and initialize some values for the + % first row the most orderly possible + lastFColumnA = currentFColumnA; + currentFColumnA = F(:,outer,inAlign); + bestA = currentFColumnA(1); + currentinA = lastFColumnA(1); + + lastFColumnU = currentFColumnU; + currentFColumnU = F(:,outer,inGapUp); + bestU = currentFColumnU(1); + + lastFColumnL = currentFColumnL; + currentFColumnL = F(:,outer,inGapLeft); + currentinGL = lastFColumnL(1); + + for inner = 2:n+1 + + % grab the data from the columns the most orderly possible + upOpen = bestA + gapopen; + inA = currentinA; + currentinA = lastFColumnA(inner); + leftOpen = currentinA + gapopen; + + inGL = currentinGL; + currentinGL = lastFColumnL(inner); + leftExtend = currentinGL + gapextend; + + upExtend = bestU + gapextend; + inGU = lastFColumnU(inner-1); + + % operate state 'inGapUp' + + if upOpen > upExtend + bestU = upOpen; ptr = 1; % diagonal + elseif upOpen < upExtend + bestU = upExtend; ptr = 2; % up + else % upOpen == upExtend + bestU = upOpen; ptr = 3; % diagonal and up + end + currentFColumnU(inner)=bestU; + ptrU(inner)=ptr; + + % operate state 'inGapLeft' + + if leftOpen > leftExtend + bestL = leftOpen; ptr = 1; % diagonal + elseif leftOpen < leftExtend + bestL = leftExtend; ptr = 4; % left + else % leftOpen == leftExtend + bestL = leftOpen; ptr = 5; % diagonal and left + end + currentFColumnL(inner) = bestL; + ptrL(inner) = ptr; + + % operate state 'inAlign' + + if inA > inGU + if inA > inGL + bestA = inA; ptr = 1; % diagonal + elseif inGL > inA + bestA = inGL; ptr = 4; % left + else + bestA = inA; ptr = 5; % diagonal and left + end + elseif inGU > inA + if inGU > inGL + bestA = inGU; ptr = 2; % up + elseif inGL > inGU + bestA = inGL; ptr = 4; % left + else + bestA = inGU; ptr = 6; % up & left + end + else + if inA > inGL + bestA = inA; ptr = 3; % diagonal & up + elseif inGL > inA + bestA = inGL; ptr = 4; % left + else + bestA = inA; ptr = 7; % all + end + end + + bestA = bestA + scoredMatchColumn(inner-1); + currentFColumnA(inner) = bestA; + ptrA(inner) = ptr; + + end %inner + + % put back updated columns + F(:,outer,inGapLeft) = currentFColumnL; + F(:,outer,inGapUp) = currentFColumnU; + F(:,outer,inAlign) = currentFColumnA; + % save columns of pointers + pointer(:,outer,inAlign) = ptrA; + pointer(:,outer,inGapUp) = ptrU; + pointer(:,outer,inGapLeft)= ptrL; + end %outer + end + function pcmap = privateColorMap(selection) + %PRIVATECOLORMAP returns a custom color map + switch selection + case 1, pts = [0 0 .3 20; + 0 .1 .8 25; + 0 .9 .5 15; + .9 1 .9 8; + 1 1 0 26; + 1 0 0 26; + .4 0 0 0]; + otherwise, pts = [0 0 0 128; 1 1 1 0]; + end + xcl=1; + for i=1:size(pts,1)-1 + xcl=[xcl,i+1/pts(i,4):1/pts(i,4):i+1]; + end + pcmap = interp1(pts(:,1:3),xcl); + end +end + -- cgit v0.8.3.1-22-g547a