function DIST_OUT = GenomePairwiseDist(IN_PROTS, ALIGN_NAME) % GenomePairwiseDist % This will align all of the protein segments in the genome structure % and return a cell-array. Each cell is the alignment of one % protein. If the protein is missing then the distance is NaN. % if nargin < 1 ALIGN_NAME = 'saved_alignments'; end try display('Loading Previous Alignments') d = load(ALIGN_NAME); saved_dists = d.DIST_CELL; saved_headers = d.header_names; catch %#ok display('No previous alignments found') saved_headers = {}; saved_dists = zeros(numel(IN_PROTS)); end NUM_LETTERS = 4000000; [header_names{1:length(IN_PROTS)}] = deal(IN_PROTS(:).Header); display('Starting Alignments') dist_mat = NaN(size(IN_PROTS,1)); [tf loc]=ismember(header_names, saved_headers); dist_mat(tf,tf) = saved_dists(loc(tf), loc(tf)); [I J] = find(triu(isnan(dist_mat))); inds = sub2ind(size(dist_mat), I, J); sizes = arrayfun(@(x)(length(x.Sequence)), IN_PROTS); %find all of the ones that are not worth aligning bad_mask = sizes(I) <= 1 | sizes(J) <= 1; dist_mat(inds(bad_mask)) = Inf; I(bad_mask) = []; J(bad_mask) = []; p_sizes = sizes(I)+sizes(J); counter = 1; display('Aligning...') tic while counter < length(I) cum_sizes = cumsum(p_sizes(counter+1:end)); top = counter+find(cum_sizes dist(k) = Inf; end end time = toc; inds = sub2ind(size(dist_mat), I_spots,J_spots); dist_mat(inds) = dist; counter = top+1; display([num2str(counter/length(I)) ' in ' num2str(time) ' seconds']) end DIST_CELL = dist_mat; save(ALIGN_NAME, 'DIST_CELL', 'header_names') DIST_OUT = dfun(DIST_CELL); function out_d = dfun(MAT) score12 = triu(MAT,1); self_d = diag(MAT); d = (1-bsxfun(@rdivide, score12, self_d(:))).*(1-bsxfun(@rdivide, score12, self_d(:)')); out_d = triu(d,1) + triu(d,1)';