summaryrefslogtreecommitdiffstats
authorDon Pellegrino <don@drexel.edu>2010-01-08 17:20:16 (GMT)
committer Don Pellegrino <don@drexel.edu>2010-01-08 17:20:16 (GMT)
commit170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf (patch) (unidiff)
treebcfea699462d48906f11dfc5fb4d36725bc12710
parent15b4e1318130d4c49109746efecc1018dce54da9 (diff)
downloadexp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.zip
exp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.tar.gz
exp005-170d9ee069ea4d6e8f05cd2a5ac3ed6133d600bf.tar.bz2
Will Dampier's Matlab code for analyzing the entropy of the collection by sequence position and time.
-rw-r--r--test/entropy/Align2Ref.m24
-rw-r--r--test/entropy/CalculateEntropy.m15
-rw-r--r--test/entropy/CalculateProteinEntropy.m25
-rw-r--r--test/entropy/FastNWalign2.c94
-rw-r--r--test/entropy/GenomeAlignments.m31
-rw-r--r--test/entropy/GenomePairwiseDist.m98
-rw-r--r--test/entropy/RefineAlignments.m276
-rw-r--r--test/entropy/don_anal.m40
-rw-r--r--test/entropy/nwalign_mod.m637
9 files changed, 1240 insertions, 0 deletions
diff --git a/test/entropy/Align2Ref.m b/test/entropy/Align2Ref.m
new file mode 100644
index 0000000..3021d4a
--- a/dev/null
+++ b/test/entropy/Align2Ref.m
@@ -0,0 +1,24 @@
1function NEW_INDS = Align2Ref(REF_SEQ, ALIGN, ALIGN_INDS, ALPHA)
2% Align2Ref
3% Uses a profile alignment to change a set of indicices in the
4% "alignment space" to the "reference space".
5%
6%
7% NEW_INDS = Align2Ref(REF_SEQ, ALIGN, ALIGN_INDS)
8%
9
10if nargin == 3
11 ALPHA = 'aa';
12end
13
14ref_prof = seqprofile(REF_SEQ, 'alphabet', ALPHA);
15align_prof = seqprofile(ALIGN, 'alphabet', ALPHA);
16
17[~, ind1, ind2] = profalign(ref_prof, align_prof);
18
19ninds = ind2(ALIGN_INDS);
20
21mask = sum(bsxfun(@le, ind1(:)', ninds(:)),2);
22mask(mask == 0) = 1;
23
24NEW_INDS = reshape(ind1(mask), size(ALIGN_INDS)); \ No newline at end of file
diff --git a/test/entropy/CalculateEntropy.m b/test/entropy/CalculateEntropy.m
new file mode 100644
index 0000000..18bfda5
--- a/dev/null
+++ b/test/entropy/CalculateEntropy.m
@@ -0,0 +1,15 @@
1function OUT_VEC = CalculateEntropy(ARRAY)
2% CalculateEntropy
3% Calculate the Informationational Entropy of each position in a
4% multialignment. In this function I remove all gap characters from
5% each column of the alignment before doing the calculation.
6%
7%
8%
9
10num_array = aa2int(ARRAY);
11bins = histc(num_array, 0:21, 1);
12
13probs=bsxfun(@rdivide, bins(2:end-1,:),sum(bins(2:end-1,:)));
14
15OUT_VEC = -nansum(log(probs).*double(probs~=0).*probs); \ No newline at end of file
diff --git a/test/entropy/CalculateProteinEntropy.m b/test/entropy/CalculateProteinEntropy.m
new file mode 100644
index 0000000..41f06f5
--- a/dev/null
+++ b/test/entropy/CalculateProteinEntropy.m
@@ -0,0 +1,25 @@
1function OUT_DATA=CalculateProteinEntropy(Ref, ALIGN_CELL)
2
3OUT_DATA = cell(size(ALIGN_CELL));
4for i = 1:size(ALIGN_CELL,1)
5
6 aligned_seqs = ALIGN_CELL{i};
7
8 ref_seq = Ref.Sequence;
9
10 ref_prof = seqprofile(ref_seq, 'alphabet', 'aa');
11 seqs_prof = seqprofile(aligned_seqs, 'alphabet', 'aa');
12
13 [mp, H1, H2] = profalign(ref_prof, seqs_prof);
14
15 merged_seqs = char(zeros(size(aligned_seqs,1)+1, size(mp,2)));
16 merged_seqs(2:end, H2) = aligned_seqs;
17 merged_seqs(1, H1) = ref_seq;
18
19 entropy_vals = CalculateEntropy(merged_seqs);
20
21 %only pull out the values that are in the reference positions
22 ref_entropy = entropy_vals(H1);
23
24 OUT_DATA{i} = ref_entropy;
25end
diff --git a/test/entropy/FastNWalign2.c b/test/entropy/FastNWalign2.c
new file mode 100644
index 0000000..33b286d
--- a/dev/null
+++ b/test/entropy/FastNWalign2.c
@@ -0,0 +1,94 @@
1#include "mex.h"
2#include "matrix.h"
3
4void simplegap(double scoredMatchMat[],
5const double gap, int n, int m, double output_F[], double output_P[])
6{
7 // Standard Needleman-Wunsch algorithm
8
9 double up, left, diagonal, best, pos;
10
11
12 int i,j;
13
14
15 for(i=0;i<m;i++)
16 {
17 output_F[i]=i*gap;
18 output_P[i]=2;
19 }
20
21
22 for(j=0; j<n; j++) //put initial values in
23 {
24 output_F[j*m]=j*gap;
25 output_P[j*m]=4;
26 }
27 output_P[0]=1;
28
29
30
31 for(j=1;j<n;j++) //cycle through columns
32 {
33 best=output_F[(j)*m];
34
35 for(i=1; i<m; i++) //cycle through the rows
36 {
37 up=best+gap;
38 left=output_F[i+(j-1)*m]+gap;
39 diagonal=output_F[i-1+(j-1)*m]+scoredMatchMat[i-1+(j-1)*m];
40
41 if (up>left)
42 {
43 best=up;
44 pos=2;
45 }
46 else
47 {
48 best=left;
49 pos=4;
50 }
51
52 if (diagonal>=best)
53 {
54 best=diagonal;
55 output_P[i+j*m]=1;
56 }
57 else
58 {
59 output_P[i+j*m]=pos;
60 }
61 output_F[i+j*m]=best;
62 }
63 }
64}
65
66void mexFunction(int nlhs, mxArray *plhs[],
67int nrhs, const mxArray *prhs[])
68{
69
70 double *gap, *output_F, *output_P;
71 double *scoredMatchMat;
72 int n, m;
73 mwSize i;
74
75 //double *F_col, *ptr_col;
76 m=mxGetM(prhs[0]);
77 n=mxGetN(prhs[0]);
78 gap=mxGetPr(prhs[1]);
79
80 plhs[0]=mxCreateDoubleMatrix(m,n,mxREAL);
81 plhs[1]=mxCreateDoubleMatrix(m,n,mxREAL);
82
83 output_F=mxGetPr(plhs[0]);
84 output_P=mxGetPr(plhs[1]);
85
86
87 scoredMatchMat=mxGetPr(prhs[0]);
88 //
89
90
91 simplegap(scoredMatchMat,*gap,n,m,output_F,output_P);
92
93}
94
diff --git a/test/entropy/GenomeAlignments.m b/test/entropy/GenomeAlignments.m
new file mode 100644
index 0000000..be56570
--- a/dev/null
+++ b/test/entropy/GenomeAlignments.m
@@ -0,0 +1,31 @@
1function [ALIGN inds] = GenomeAlignments(IN_GENOMES, DIST_MAT)
2% GenomeAlignments
3% This will align all of the protein segments in the genome structure
4% and return a cell-array. Each cell is the alignment of one
5% protein. If the protein is missing then the distance is NaN.
6%
7
8[tree inds] = MakeTree(DIST_MAT);
9ALIGN = malign(tree, inds);
10
11 function align=malign(tree, inds)
12 seqs = arrayfun(@(x)(x.Sequence), IN_GENOMES(inds), 'uniformoutput', false);
13 align = multialign(seqs, tree);
14 end
15
16
17
18 function [tree_obj inds]=MakeTree(dist_mat)
19 % MakeTree
20 % Takes a distance matrix and returns a tree object and the indicies
21 % to the rows that are in the tree.
22 %
23
24 inds = find(~all(isnan(dist_mat)|isinf(dist_mat)|dist_mat==0));
25 dvec = squareform(dist_mat(inds, inds));
26 dvec(dvec <= 0) = rand(nnz(dvec <= 0),1)*min(dvec(dvec>0));
27 tree_obj = seqlinkage(dvec);
28
29 end
30
31end \ No newline at end of file
diff --git a/test/entropy/GenomePairwiseDist.m b/test/entropy/GenomePairwiseDist.m
new file mode 100644
index 0000000..544db0a
--- a/dev/null
+++ b/test/entropy/GenomePairwiseDist.m
@@ -0,0 +1,98 @@
1function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME)
2% GenomePairwiseDist
3% This will align all of the protein segments in the genome structure
4% and return a cell-array. Each cell is the alignment of one
5% protein. If the protein is missing then the distance is NaN.
6%
7
8if nargin < 1
9 ALIGN_NAME = 'saved_alignments';
10end
11
12try
13 display('Loading Previous Alignments')
14 d = load(ALIGN_NAME);
15 saved_dists = d.DIST_CELL;
16 saved_headers = d.header_names;
17catch %#ok<CTCH>
18 display('No previous alignments found')
19 saved_headers = {};
20 saved_dists = zeros(numel(IN_PROTS));
21end
22
23NUM_LETTERS = 4000000;
24
25
26[header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header);
27
28display('Starting Alignments')
29
30dist_mat = NaN(size(IN_PROTS,1));
31
32[tf loc]=ismember(header_names, saved_headers);
33
34dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf));
35[I J] = find(triu(isnan(dist_mat)));
36inds = sub2ind(size(dist_mat), I, J);
37
38sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS);
39
40%find all of the ones that are not worth aligning
41bad_mask = sizes(I) <= 1 | sizes(J) <= 1;
42dist_mat(inds(bad_mask)) = Inf;
43I(bad_mask) = [];
44J(bad_mask) = [];
45
46p_sizes = sizes(I)+sizes(J);
47counter = 1;
48display('Aligning...')
49tic
50while counter < length(I)
51 cum_sizes = cumsum(p_sizes(counter+1:end));
52 top = counter+find(cum_sizes<NUM_LETTERS,1, 'last');
53
54 I_spots = I(counter:top);
55 J_spots = J(counter:top);
56 need_align = cell(length(I_spots),2);
57 [need_align{:,1}] = deal(IN_PROTS(I_spots).Sequence);
58 [need_align{:,2}] = deal(IN_PROTS(J_spots).Sequence);
59 dist = zeros(size(I_spots));
60
61 %parfor k = 1:size(need_align,1)
62 for k = 1:size(need_align,1)
63 %display([num2str(k) ' ' num2str(toc)])
64 try
65 dist(k)=nwalign_mod(need_align{k,:});
66 catch %#ok<CTCH>
67 dist(k) = Inf;
68 end
69 end
70 time = toc;
71 inds = sub2ind(size(dist_mat), I_spots,J_spots);
72 dist_mat(inds) = dist;
73 counter = top+1;
74 display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds'])
75end
76
77DIST_CELL = dist_mat;
78save(ALIGN_NAME, 'DIST_CELL', 'header_names')
79DIST_OUT = dfun(DIST_CELL);
80
81
82
83
84
85function out_d = dfun(MAT)
86
87score12 = triu(MAT,1);
88self_d = diag(MAT);
89d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)'));
90
91out_d = triu(d,1) + triu(d,1)';
92
93
94
95
96
97
98
diff --git a/test/entropy/RefineAlignments.m b/test/entropy/RefineAlignments.m
new file mode 100644
index 0000000..0e69275
--- a/dev/null
+++ b/test/entropy/RefineAlignments.m
@@ -0,0 +1,276 @@
1function [OUT_ALIGN cur_score] = RefineAlignments(IN_ALIGN, varargin)
2% RefineAlignments
3% Uses an interative algorithm to improve the multiple alignments of
4% an input sequence. It will iteratively remove a fraction of the
5% sequences, compress any gaps, and then re-align. This will be done
6% for NUM_REPS.
7%
8% Written by Will Dampier. Contact at wnd22@drexel.edu
9% Version 1.0 on 10/23/09
10%
11% [OUT_ALIGN SCORE] = RefineAlignments(IN_ALIGN)
12%
13% IN_ALIGN A char-array representing a multiple alignment to be
14% refined.
15%
16% OUT_ALIGN The refined alignment.
17% SCORE The new alignment score.
18%
19%
20% Optional Arguements:
21%
22% 'Alphabet' Either 'AA' or 'NT' to indicate whether this is a
23% nucleotide or amino acid alignment.
24%
25% 'GapPenalty' The penalty for gaps. This is set at -8 by
26% default.
27%
28% 'DistMat' The distance matrix to use for the alignment
29% scoring. Default is BLOSUM50 for 'AA' and NUC44
30% for 'NT'
31%
32% 'NumReps' The number of refining iterations to do.
33%
34% 'NumTry' The number of tries to perform at each iteration.
35%
36% 'RefineType' Which type of refinement to perform. This can
37% be either 'STOCASTIC', 'DETERMINISTIC' or 'MIXED'.
38%
39% 'Display' A boolean indicating whether to display the output
40% at each iteration.
41%
42%
43
44
45
46
47RM_FRAC = 0.1;
48NUM_TRIES = 10;
49NUM_REPS = 100;
50alpha = 'aa';
51dist_mat = [];
52gap_penalty = -8;
53TYPE = 'mixed';
54disp_flag = true;
55
56for i = 1:2:length(varargin)
57 switch lower(varargin{i})
58
59 case 'alphabet'
60 if strcmpi('nt', varargin{i+1}) || strcmpi('aa', varargin{i+1})
61 alpha = varargin{i+1};
62 else
63 error('RefineAlignments:BADALPHA', ...
64 'Arguement to "alphabet" must be "NT" or "AA"')
65 end
66 case 'gappenalty'
67 if isnumeric(varargin{i+1}) && varargin{i+1} < 0
68 gap_penalty = varargin{i+1};
69 else
70 error('RefineAlignments:BADGAP', ...
71 'Arguement to "GapPenalty" must be negative numeric')
72 end
73 case 'distmat'
74 dist_mat = varargin{i+1};
75
76 case 'numreps'
77 if isnumeric(varargin{i+1}) && varargin{i+1} > 1
78 NUM_REPS = varargin{i+1};
79 else
80 error('RefineAlignments:BADREPS', ...
81 'Arguement to NumReps must be a positive numeric')
82 end
83
84 case 'refinetype'
85 TYPE = varargin{i+1};
86
87 case 'display'
88 if islogical(varargin{i+1})
89 disp_flag = varargin{i+1};
90 else
91 error('RefineAlignments:DISPLAY', ...
92 'Arguement to Display must be a boolean.')
93 end
94
95 otherwise
96 error('RefineAlignments:BADARG', 'Unknown arguement: %s', ...
97 lower(varargin{i}))
98
99 end
100end
101
102if isempty(dist_mat)
103 if strcmpi(alpha, 'nt')
104 dist_mat = nuc44;
105 toint = @nt2int;
106 else
107 dist_mat = blosum50;
108 toint = @aa2int;
109 end
110end
111
112dist_mat(end+1,:) = gap_penalty;
113dist_mat(:,end+1) = gap_penalty;
114dist_mat(end,end) = 0;
115
116switch upper(TYPE)
117 case 'MIXED'
118 is_stoch = true;
119 is_mix = true;
120 case 'STOCASTIC'
121 is_stoch = true;
122 is_mix = false;
123 case 'DETERMINISTIC'
124 is_stoch = false;
125 is_mix = false;
126end
127
128
129OUT_ALIGN = IN_ALIGN;
130cur_score = CalculateScore(OUT_ALIGN, dist_mat, toint);
131
132for i = 1:NUM_REPS
133
134 %determine whether to switch based on the "mix" factor
135 if is_mix
136 is_stoch = ~is_stoch;
137 end
138
139
140 rm_rows = GetRows(OUT_ALIGN, is_stoch, RM_FRAC, NUM_TRIES);
141 counter = 1;
142 while counter < NUM_TRIES
143 new_mat = SplitAndReAlign(OUT_ALIGN, rm_rows(:,counter), alpha);
144
145 new_score = CalculateScore(new_mat, dist_mat, toint);
146 if new_score > cur_score
147 if disp_flag
148 disp_cell = {'On Iter: ', num2str(i), ' Improved by: ', ...
149 num2str(abs(new_score-cur_score)), ' and by ', ...
150 num2str(abs(size(new_mat,2)-size(OUT_ALIGN,2))), ...
151 ' columns', ' Using stoch:', num2str(is_stoch)};
152 display([disp_cell{:}])
153 end
154 cur_score = new_score;
155 OUT_ALIGN = new_mat;
156 break
157 end
158 counter = counter + 1;
159 end
160end
161
162function rows_mat = GetRows(MAT, STOCH, RM_FRAC, NUM_TRIES)
163% GetRows
164% A helper function which determines the rows to split the alignment
165% with.
166%
167% MAT A char-array of the multiple alignment.
168% STOCH A boolean indicating whether to use a stochastic approach.
169% RM_FRAC The fraction of rows to remove.
170% NUM_TRIES The number of tries to return.
171%
172% rows_mat A matrix indicating which rows to remove.
173%
174
175if STOCH
176 rows_mat = rand(size(MAT,1),NUM_TRIES) > RM_FRAC;
177 if any(all(rows_mat,1)) || any(all(~rows_mat,1))
178 cols = any(all(rows_mat,1)) || any(all(~rows_mat,1));
179 rows_mat(1,cols) = ~rows_mat(1,cols);
180 end
181
182else
183 gap_mask = MAT =='-';
184 runs = FindRuns(gap_mask);
185 score = mode(runs.*(runs~=0))-min(runs);
186 [~, order] = sort(score, 'descend');
187 rm_rows = order(1:NUM_TRIES);
188 [~, norder] = sort(runs(:,rm_rows),'descend');
189 rows_mat = false(size(MAT,1), NUM_TRIES);
190 num_remove = ceil(RM_FRAC*size(MAT,1));
191 for i = 1:NUM_TRIES
192 rows_mat(norder(1:num_remove,i),i) = true;
193 end
194end
195
196function NEW_MAT = SplitAndReAlign(MAT, ROWS, alpha)
197% SplitAndReAlign
198% This will split the alignment matix based on the provided rows and
199% then create two profiles and re-align them.
200%
201% MAT A char-array of the alignment.
202% ROWS A boolean-array describing the switch between rows
203% alpha Either 'NT' or 'AA' to indicate which sequence profile to
204% create.
205%
206% NEW_MAT The ReAlinged alignment
207%
208
209mat1 = MAT(ROWS, :);
210mat2 = MAT(~ROWS, :);
211
212mat1(:,all(mat1 == '-',1)) = [];
213mat2(:,all(mat2 == '-',1)) = [];
214
215pmat1 = seqprofile(mat1, 'alphabet', alpha);
216pmat2 = seqprofile(mat2, 'alphabet', alpha);
217try
218 [~, ind1, ind2] = profalign(pmat1, pmat2);
219catch
220 ind1 = 1:size(pmat1,2);
221 ind2 = 1:size(pmat2,2);
222end
223
224NEW_MAT = char(0);
225NEW_MAT(ROWS, ind1) = mat1;
226NEW_MAT(~ROWS, ind2) = mat2;
227NEW_MAT(~isletter(NEW_MAT)) = '-';
228NEW_MAT(all(NEW_MAT == '-',1)) = [];
229
230
231
232
233function s = CalculateScore(MAT, DIST, FUN)
234% CalculateScore
235% A function which calculates the score of the alignment. This score
236% is based on the agreement between the consensus and each individual
237% alignment.
238%
239% MAT A char-array of a multiple alignment.
240% DIST A distance matrix which indicates the penalty for
241% mismatches.
242% FUN A function which converts letter to numbers such at nt2int
243% or aa2int.
244%
245% s The score for this alingment.
246%
247%
248
249cons = seqconsensus(MAT);
250num_cons = FUN(cons);
251num_align = FUN(MAT);
252ncons = repmat(num_cons(:), [size(num_align,1), 1]);
253nalign = num_align(:);
254lookupinds = sub2ind(size(DIST), ncons, nalign);
255s = sum(DIST(lookupinds));
256
257function reverse_looking = FindRuns(input)
258% FindRuns
259% Finds consecutive runs of 1's along the rows of a boolean array.
260%
261% INPUT = [1 1 1 0 0 0 1 0 1 0 1 1;
262% 0 1 1 1 0 1 1 0 0 1 1 1];
263% RL = [1 2 3 0 0 0 1 0 1 0 1 2;
264% [0 3 2 1 0 1 2 0 0 1 2 3];
265%
266
267[m,n] = size(input);
268reverse_looking = [zeros(1,m);input.'];
269reverse_looking = reverse_looking(:);
270p = find(~reverse_looking);
271reverse_looking(p) = [0;1-diff(p)];
272reverse_looking = reshape(cumsum(reverse_looking),[],m).';
273reverse_looking(:,1) = [];
274
275
276
diff --git a/test/entropy/don_anal.m b/test/entropy/don_anal.m
new file mode 100644
index 0000000..7d0a648
--- a/dev/null
+++ b/test/entropy/don_anal.m
@@ -0,0 +1,40 @@
1d = fastaread('C:\dondata\FASTA2.fa');
2lens = arrayfun(@(x)(numel(x.Sequence)), d);
3d(lens < 300) = [];
4rexp = '/Human/(\w*)/([\d\w]*)/(.*?)/(\d{4})/';
5headers = cell(length(d), 1);
6[headers{:}] = deal(d(:).Header);
7groups = regexp(headers, rexp, 'tokens', 'once');
8cgroups = cat(1,groups{:});
9dates = unique(cgroups(:,end));
10
11seqs = cell(length(d), 1);
12[seqs{:}] = deal(d(:).Sequence);
13
14ALIGN_CELL = cell(length(dates), 1);
15INDS_CELL = cell(length(dates), 1);
16PAIRWISE_CELL = cell(length(dates),1);
17for i = 1:length(dates)
18 display(dates{i})
19 tf = strcmp(cgroups(:,end), dates{i});
20 if isempty(PAIRWISE_CELL{i})
21 pairwise_dists = GenomePairwiseDist(d(tf), dates{i});
22 PAIRWISE_CELL{i} = pairwise_dists;
23 else
24 pairwise_dists = PAIRWISE_CELL{i};
25 end
26 if isempty(ALIGN_CELL{i}) || isempty(INDS_CELL{i})
27 [ALIGN_CELL{i}, INDS_CELL{i}] = GenomeAlignments(d(tf), ...
28 pairwise_dists);
29 save restore_data PAIRWISE_CELL ALIGN_CELL INDS_CELL
30 end
31end
32
33for i = 1:length(ALIGN_CELL)
34 [ALIGN_CELL{i}, ~]= RefineAlignments(ALIGN_CELL{i});
35end
36
37
38ents = CalculateProteinEntropy(d(1), ALIGN_CELL);
39
40dents = cat(1, ents{:});
diff --git a/test/entropy/nwalign_mod.m b/test/entropy/nwalign_mod.m
new file mode 100644
index 0000000..76aade6
--- a/dev/null
+++ b/test/entropy/nwalign_mod.m
@@ -0,0 +1,637 @@
1function [score, alignment, startat, matrices] = nwalign(seq1,seq2,varargin)
2%NWALIGN performs Needleman-Wunsch global alignment of two sequences.
3%
4% NWALIGN(SEQ1, SEQ2) returns the score (in bits) for the optimal
5% alignment. Note: The scale factor used to calculate the score is
6% provided by the scoring matrix info (see below). If this is not
7% defined, then NWALIGN returns the raw score.
8%
9% [SCORE, ALIGNMENT] = NWALIGN(SEQ1, SEQ2) returns a string showing an
10% optimal global alignment of amino acid (or nucleotide) sequences SEQ1
11% and SEQ2.
12%
13% [SCORE, ALIGNMENT, STARTAT] = NWALIGN(SEQ1, SEQ2) returns a 2x1 vector
14% with the starting point indices indicating the starting point of the
15% alignment in the two sequences. Note: this output is for consistency
16% with SWALIGN and will always be [1;1] because this is a global
17% alignment.
18%
19% NWALIGN(..., 'ALPHABET', A) specifies whether the sequences are
20% amino acids ('AA') or nucleotides ('NT'). The default is AA.
21%
22% NWALIGN(..., 'SCORINGMATRIX', matrix) defines the scoring matrix to be
23% used for the alignment. The default is BLOSUM50 for AA or NUC44 for NT.
24%
25% NWALIGN(..., 'SCALE' ,scale) indicates the scale factor of the scoring
26% matrix to return the score using arbitrary units. If the scoring matrix
27% Info also provides a scale factor, then both are used.
28%
29% NWALIGN(..., 'GAPOPEN', penalty) defines the penalty for opening a gap
30% in the alignment. The default gap open penalty is 8.
31%
32% NWALIGN(..., 'EXTENDGAP', penalty) defines the penalty for extending a
33% gap in the alignment. If EXTENDGAP is not specified, then extensions to
34% gaps are scored with the same value as GAPOPEN.
35%
36% NWALIGN(..., 'SHOWSCORE', true) displays the scoring space and the
37% winning path.
38%
39%
40% Examples:
41%
42% % Return the score in bits and the global alignment using the
43% % default scoring matrix (BLOSUM50).
44% [score, align] = nwalign('VSPAGMASGYD', 'IPGKASYD')
45%
46% % Use user-specified scoring matrix and "gap open" penalty.
47% [score, align] = nwalign('IGRHRYHIGG', 'SRYIGRG',...
48% 'scoringmatrix', @pam250, 'gapopen',5)
49%
50% % Return the score in nat units (nats).
51% [score, align] = nwalign('HEAGAWGHEE', 'PAWHEAE', 'scale', log(2))
52%
53% % Display the scoring space and the winning path.
54% nwalign('VSPAGMASGYD', 'IPGKASYD', 'showscore', true)
55%
56% See also BLOSUM, MULTIALIGN, NT2AA, PAM, PROFALIGN, SEQDOTPLOT,
57% SHOWALIGNMENT, SWALIGN.
58
59% References:
60% R. Durbin, S. Eddy, A. Krogh, and G. Mitchison. Biological Sequence
61% Analysis. Cambridge UP, 1998.
62% Needleman, S. B., Wunsch, C. D., J. Mol. Biol. (1970) 48:443-453
63
64% Copyright 2002-2006 The MathWorks, Inc.
65% $Revision: 1.22.6.10 $ $Date: 2006/05/17 20:48:28 $
66
67gapopen = -8;
68gapextend = -8;
69setGapExtend = false;
70showscore=false;
71isAminoAcid = true;
72scale=1;
73
74% If the input is a structure then extract the Sequence data.
75if isstruct(seq1)
76 try
77 seq1 = seqfromstruct(seq1);
78 catch
79 rethrow(lasterror);
80 end
81end
82if isstruct(seq2)
83 try
84 seq2 = seqfromstruct(seq2);
85 catch
86 rethrow(lasterror);
87 end
88end
89if nargin > 2
90 if rem(nargin,2) == 1
91 error('Bioinfo:IncorrectNumberOfArguments',...
92 'Incorrect number of arguments to %s.',mfilename);
93 end
94 okargs = {'scoringmatrix','gapopen','extendgap','alphabet','scale','showscore'};
95 for j=1:2:nargin-2
96 pname = varargin{j};
97 pval = varargin{j+1};
98 k = strmatch(lower(pname), okargs); %#ok
99 if isempty(k)
100 error('Bioinfo:UnknownParameterName',...
101 'Unknown parameter name: %s.',pname);
102 elseif length(k)>1
103 error('Bioinfo:AmbiguousParameterName',...
104 'Ambiguous parameter name: %s.',pname);
105 else
106 switch(k)
107 case 1 % scoring matrix
108 if isnumeric(pval)
109 ScoringMatrix = pval;
110 else
111 if ischar(pval)
112 pval = lower(pval);
113 end
114 try
115 [ScoringMatrix,ScoringMatrixInfo] = feval(pval);
116 catch
117 error('Bioinfo:InvalidScoringMatrix','Invalid scoring matrix.');
118 end
119 end
120 case 2 %gap open penalty
121 gapopen = -pval;
122 case 3 %gap extend penalty
123 gapextend = -pval;
124 setGapExtend = true;
125 case 4 %if sequence is nucleotide
126 if strcmpi(pval,'nt')
127 isAminoAcid = false;
128 end
129 case 5 % scale
130 scale=pval;
131 case 6 % showscore
132 showscore = pval == true;
133 end
134 end
135 end
136end
137
138% setting the default scoring matrix
139if ~exist('ScoringMatrix','var')
140 if isAminoAcid
141 [ScoringMatrix,ScoringMatrixInfo] = blosum50;
142 else
143 [ScoringMatrix,ScoringMatrixInfo] = nuc44;
144 end
145end
146
147
148% getting the scale from ScoringMatrixInfo, if it exists
149if exist('ScoringMatrixInfo','var') && isfield(ScoringMatrixInfo,'Scale')
150 scale=scale*ScoringMatrixInfo.Scale;
151end
152
153% handle properly "?" characters typically found in pdb files
154if isAminoAcid
155 if ischar(seq1)
156 seq1 = strrep(seq1,'?','X');
157 else
158 seq1(seq1 == 26) = 23;
159 end
160 if ischar(seq2)
161 seq2 = strrep(seq2,'?','X');
162 else
163 seq2(seq2 == 26) = 23;
164 end
165end
166
167% check input sequences
168% if isAminoAcid && ~(isaa(seq1) && isaa(seq2))
169% error('Bioinfo:InvalidAminoAcidSequences',...
170% 'Both sequences must be amino acids, use ALPHABET = ''NT'' for aligning nucleotides.');
171% elseif ~isAminoAcid && ~(isnt(seq1) && isnt(seq2))
172% error('Bioinfo:InvalidNucleotideSequences',...
173% 'When ALPHABET = ''NT'', both sequences must be nucleotides.');
174% end
175
176% use numerical arrays for easy indexing
177if ischar(seq1)
178 seq1=upper(seq1); %the output alignment will be all uppercase
179 if isAminoAcid
180 intseq1 = aa2int(seq1);
181 else
182 intseq1 = nt2int(seq1);
183 end
184else
185 intseq1=seq1;
186 if isAminoAcid
187 seq1 = int2aa(intseq1);
188 else
189 seq1 = int2nt(intseq1);
190 end
191end
192if ischar(seq2)
193 seq2=upper(seq2); %the output alignment will be all uppercase
194 if isAminoAcid
195 intseq2 = aa2int(seq2);
196 else
197 intseq2 = nt2int(seq2);
198 end
199else
200 intseq2=seq2;
201 if isAminoAcid
202 seq2 = int2aa(intseq2);
203 else
204 seq2 = int2nt(intseq2);
205 end
206end
207
208
209m = length(seq1);
210n = length(seq2);
211if ~n||~m
212 error('Bioinfo:InvalidLengthSequences','Length of input sequences must be greater than 0');
213end
214
215% If unknown, ambiguous or gaps appear in the sequence, we need to make
216% sure that ScoringMatrix can handle them.
217
218% possible values are
219% B Z X * - ?
220% 21 22 23 24 25 26
221
222scoringMatrixSize = size(ScoringMatrix,1);
223
224highestVal = max([intseq1, intseq2]);
225if highestVal > scoringMatrixSize
226 % if the matrix contains the 'Any' we map to that
227 if isAminoAcid
228 anyVal = aa2int('X');
229 else
230 anyVal = nt2int('N');
231 end
232 if scoringMatrixSize >= anyVal
233 intseq1(intseq1>scoringMatrixSize) = anyVal;
234 intseq2(intseq2>scoringMatrixSize) = anyVal;
235 else
236 error('Bioinfo:InvalidSymbolsInInputSequeces',...
237 'Sequences contain symbols that cannot be handled by the given scoring matrix.');
238 end
239end
240
241if setGapExtend % call more complicated algorithm if we have
242 [F, pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend);
243else
244% [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen);
245 scoredMatchMat=[ScoringMatrix(intseq2,intseq1(:)) zeros(n,1); zeros(1,m+1)];
246 [F pointer]=FastNWalign2(scoredMatchMat,gapopen);
247end
248
249% trace back through the pointer matrix
250halfn=ceil((n+1)/2);
251halfm=ceil((m+1)/2);
252
253i = n+1; j = m+1;
254path = zeros(n+m,2);
255step = 1;
256
257score = max(F(n+1,m+1,:));
258
259if setGapExtend
260
261 if F(n+1,m+1,3)==score % favor with left-gap
262 laststate=3;
263 elseif F(n+1,m+1,2)==score % then with up-gap
264 laststate=2;
265 else % at last with match
266 laststate=1;
267 end
268
269 while (i>halfn && j>halfm) % in the rigth half favor gaps when several
270 % paths lead to the highest score
271 state=laststate;
272 if bitget(pointer(i,j,state),3)
273 laststate=3;
274 elseif bitget(pointer(i,j,state),2)
275 laststate=2;
276 else
277 laststate=1;
278 end
279
280 switch state
281 case 1 % is diagonal
282 j = j - 1;
283 i = i - 1;
284 path(step,:) = [j,i];
285 case 2 % is up
286 i = i - 1;
287 path(step,2) = i;
288 case 3 % is left
289 j = j - 1;
290 path(step,1) = j;
291 end
292 step = step +1;
293 end
294
295 while (i>1 || j>1) % in the rigth half favor matchs when several
296 % paths lead to the highest score
297 state=laststate;
298 if bitget(pointer(i,j,state),1)
299 laststate=1;
300 elseif bitget(pointer(i,j,state),2)
301 laststate=2;
302 else
303 laststate=3;
304 end
305
306 switch state
307 case 1 % is diagonal
308 j = j - 1;
309 i = i - 1;
310 path(step,:) = [j,i];
311 case 2 % is up
312 i = i - 1;
313 path(step,2) = i;
314 case 3 % is left
315 j = j - 1;
316 path(step,1) = j;
317 end
318 step = step +1;
319 end
320
321else % ~setGapExtend
322
323 while (i > 1 || j > 1)
324
325 switch pointer(i,j)
326 case 1 % diagonal only
327 j = j - 1;
328 i = i - 1;
329 path(step,:) = [j,i];
330 case 2 % up only
331 i = i - 1;
332 path(step,2) = i;
333 case 4 % left only
334 j = j - 1;
335 path(step,1) = j;
336 case 6 % up or left --> up (favors gaps in seq2)
337 j = j - 1;
338 path(step,1) = j;
339 otherwise %3 diagonal or up --> diagonal (favors no gaps)
340 %4 diagonal or left --> diagonal (favors no gaps)
341 %7 diagonal or left or up --> diagonal (favors no gaps)
342 j = j - 1;
343 i = i - 1;
344 path(step,:) = [j,i];
345 end
346 step = step +1;
347 end
348
349end % if setGapExtend
350
351% re-scaling the output score
352score = scale * score;
353
354if nargout<=1 && ~showscore
355 return
356end
357
358path(step:end,:) = []; % step-1 is the length of the alignment
359path = flipud(path);
360
361% setting the size of the alignment
362alignment = repmat(('- -')',1,step-1);
363
364% adding sequence to alignment
365alignment(1,path(:,1)>0) = seq1;
366alignment(3,path(:,2)>0) = seq2;
367
368% find locations where there are no gaps
369h=find(all(path>0,2));
370if isAminoAcid
371 noGaps1=aa2int(alignment(1,h));
372 noGaps2=aa2int(alignment(3,h));
373else
374 noGaps1=nt2int(alignment(1,h));
375 noGaps2=nt2int(alignment(3,h));
376end
377
378% erasing symbols that can not be scored
379htodel=max([noGaps1;noGaps2])>scoringMatrixSize;
380h(htodel)=[];
381noGaps1(htodel)=[];
382noGaps2(htodel)=[];
383% score pairs with no gap
384value = ScoringMatrix(sub2ind(size(ScoringMatrix),double(noGaps1),double(noGaps2)));
385
386% insert symbols of the match string into the alignment
387alignment(2,h(value>=0)) = ':';
388alignment(2,h(noGaps1==noGaps2)) = '|';
389
390startat = [1;1];
391
392% undocumented fourth output -- score and pointer matrices
393if nargout > 3
394 matrices.Scores = F;
395 matrices.Pointers = pointer;
396end
397
398if showscore
399 figure
400 F=scale.*max(F(2:end,2:end,:),[],3);
401 clim=max(max(max(abs(F(~isinf(F))))),eps);
402 imagesc(F,[-clim clim]);
403 colormap(privateColorMap(1));
404 set(colorbar,'YLim',[min([F(:);-eps]) max([F(:);eps])])
405 title('Score for best path')
406 xlabel('Sequence 1')
407 ylabel('Sequence 2')
408 hold on
409 plot(path(all(path>0,2),1),path(all(path>0,2),2),'k.')
410end
411%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
412% function [F, pointer] = simplegap(intseq1,m,intseq2,n,ScoringMatrix,gap)
413% % Standard Needleman-Wunsch algorithm
414%
415% %LOOP IMPLEMENTED IN FastNWalign2.c
416%
417% % set up storage for dynamic programming matrix
418% F = zeros(n+1,m+1);
419% F(2:end,1) = gap * (1:n)';
420% F(1,2:end) = gap * (1:m);
421%
422%
423% %and for the back tracing matrix
424% pointer= repmat(uint8(4),n+1,m+1);
425% pointer(:,1) = 2; % up
426% pointer(1,1) = 1;
427%
428%
429% %initialize buffers to the first column
430% ptr = pointer(:,2); % ptr(1) is always 4
431% currentFColumn = F(:,1);
432%
433%
434% %main loop runs through the matrix looking for maximal scores
435%
436% for outer = 2:m+1
437%
438% %score current column
439% scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1));
440% %grab the data from the matrices and initialize some values
441% lastFColumn = F(:,outer-1);
442% currentFColumn = F(:,outer);
443% best = currentFColumn(1);
444%
445% %[F(:,outer) pointer(:,outer)]=FastNWalign(lastFColumn,currentFColumn,scoredMatchColumn,gap);
446%
447% for inner = 2:n+1
448% % score the three options
449% up = best + gap;
450% left = lastFColumn(inner) + gap;
451% diagonal = lastFColumn(inner-1) + scoredMatchColumn(inner-1);
452%
453% % max could be used here but it is quicker to use if statements
454% if up > left
455% best = up;
456% pos = 2;
457% else
458% best = left;
459% pos = 4;
460% end
461%
462% if diagonal >= best
463% best = diagonal;
464% ptr(inner) = 1;
465% else
466% ptr(inner) = pos;
467% end
468% currentFColumn(inner) = best;
469%
470% end % inner
471% %put back updated columns
472%
473% % save columns of pointers
474% F(:,outer) = currentFColumn;
475% % save columns of pointers
476% pointer(:,outer) = ptr;
477%
478% end % outer
479%
480% end
481%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
482 function [F,pointer] = affinegap(intseq1,m,intseq2,n,ScoringMatrix,gapopen,gapextend)
483 % Needleman-Wunsch algorithm modified to handle affine gaps
484
485 % Set states
486 inAlign = 1;
487 inGapUp = 2;
488 inGapLeft = 3;
489 numStates = 3;
490
491 % Set up storage for dynamic programming matrix:
492 % for keeping the maximum scores for every state
493
494 F = zeros(n+1,m+1,numStates);
495 F(:,1,:) = -inf;
496 F(1,:,:) = -inf;
497 F(1,1,inAlign) = 0;
498
499 F(2:end,1,inGapUp) = gapopen + gapextend * (0:n-1)';
500 F(1,2:end,inGapLeft) = gapopen + gapextend * (0:m-1);
501
502 % and for the back tracing pointers
503 pointer(n+1,m+1,numStates) = uint8(0);
504 pointer(2:end,1,inGapUp) = 2; % up
505 pointer(1,2:end,inGapLeft) = 4; % left
506
507 % initialize buffers to the first column
508 ptrA = pointer(:,1,inAlign);
509 ptrU = pointer(:,1,inGapLeft);
510 ptrL = pointer(:,1,inGapUp);
511
512 currentFColumnA = F(:,1,inAlign);
513 currentFColumnU = F(:,1,inGapUp);
514 currentFColumnL = F(:,1,inGapLeft);
515
516 % main loop runs through the matrix looking for maximal scores
517 for outer = 2:m+1
518 % score current column
519 scoredMatchColumn = ScoringMatrix(intseq2,intseq1(outer-1));
520 % grab the data from the matrices and initialize some values for the
521 % first row the most orderly possible
522 lastFColumnA = currentFColumnA;
523 currentFColumnA = F(:,outer,inAlign);
524 bestA = currentFColumnA(1);
525 currentinA = lastFColumnA(1);
526
527 lastFColumnU = currentFColumnU;
528 currentFColumnU = F(:,outer,inGapUp);
529 bestU = currentFColumnU(1);
530
531 lastFColumnL = currentFColumnL;
532 currentFColumnL = F(:,outer,inGapLeft);
533 currentinGL = lastFColumnL(1);
534
535 for inner = 2:n+1
536
537 % grab the data from the columns the most orderly possible
538 upOpen = bestA + gapopen;
539 inA = currentinA;
540 currentinA = lastFColumnA(inner);
541 leftOpen = currentinA + gapopen;
542
543 inGL = currentinGL;
544 currentinGL = lastFColumnL(inner);
545 leftExtend = currentinGL + gapextend;
546
547 upExtend = bestU + gapextend;
548 inGU = lastFColumnU(inner-1);
549
550 % operate state 'inGapUp'
551
552 if upOpen > upExtend
553 bestU = upOpen; ptr = 1; % diagonal
554 elseif upOpen < upExtend
555 bestU = upExtend; ptr = 2; % up
556 else % upOpen == upExtend
557 bestU = upOpen; ptr = 3; % diagonal and up
558 end
559 currentFColumnU(inner)=bestU;
560 ptrU(inner)=ptr;
561
562 % operate state 'inGapLeft'
563
564 if leftOpen > leftExtend
565 bestL = leftOpen; ptr = 1; % diagonal
566 elseif leftOpen < leftExtend
567 bestL = leftExtend; ptr = 4; % left
568 else % leftOpen == leftExtend
569 bestL = leftOpen; ptr = 5; % diagonal and left
570 end
571 currentFColumnL(inner) = bestL;
572 ptrL(inner) = ptr;
573
574 % operate state 'inAlign'
575
576 if inA > inGU
577 if inA > inGL
578 bestA = inA; ptr = 1; % diagonal
579 elseif inGL > inA
580 bestA = inGL; ptr = 4; % left
581 else
582 bestA = inA; ptr = 5; % diagonal and left
583 end
584 elseif inGU > inA
585 if inGU > inGL
586 bestA = inGU; ptr = 2; % up
587 elseif inGL > inGU
588 bestA = inGL; ptr = 4; % left
589 else
590 bestA = inGU; ptr = 6; % up & left
591 end
592 else
593 if inA > inGL
594 bestA = inA; ptr = 3; % diagonal & up
595 elseif inGL > inA
596 bestA = inGL; ptr = 4; % left
597 else
598 bestA = inA; ptr = 7; % all
599 end
600 end
601
602 bestA = bestA + scoredMatchColumn(inner-1);
603 currentFColumnA(inner) = bestA;
604 ptrA(inner) = ptr;
605
606 end %inner
607
608 % put back updated columns
609 F(:,outer,inGapLeft) = currentFColumnL;
610 F(:,outer,inGapUp) = currentFColumnU;
611 F(:,outer,inAlign) = currentFColumnA;
612 % save columns of pointers
613 pointer(:,outer,inAlign) = ptrA;
614 pointer(:,outer,inGapUp) = ptrU;
615 pointer(:,outer,inGapLeft)= ptrL;
616 end %outer
617 end
618 function pcmap = privateColorMap(selection)
619 %PRIVATECOLORMAP returns a custom color map
620 switch selection
621 case 1, pts = [0 0 .3 20;
622 0 .1 .8 25;
623 0 .9 .5 15;
624 .9 1 .9 8;
625 1 1 0 26;
626 1 0 0 26;
627 .4 0 0 0];
628 otherwise, pts = [0 0 0 128; 1 1 1 0];
629 end
630 xcl=1;
631 for i=1:size(pts,1)-1
632 xcl=[xcl,i+1/pts(i,4):1/pts(i,4):i+1];
633 end
634 pcmap = interp1(pts(:,1:3),xcl);
635 end
636end
637

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.