| /* $Id$ |
| * |
| * Christian Iseli, LICR ITO, Christian.Iseli@licr.org |
| * |
| * Copyright (c) 2001-2006 Swiss Institute of Bioinformatics. |
| * Copyright (C) 1998-2001 Liliana Florea. |
| */ |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <assert.h> |
| #include <limits.h> |
| #include "sim4.h" |
| #include "sim4b1.h" |
| #include "align.h" |
| #include "misc.h" |
| |
| static int snake(uchar *,uchar *,int, int, int, int); |
| static int rsnake(uchar *,uchar *,int, int, int, int, int, int); |
| |
| |
| void |
| align_path(uchar *seq1, uchar *seq2, int i1, int j1, int i2, int j2, int dist, |
| edit_script_p_t *head, edit_script_p_t *tail, int M, int N) |
| { |
| |
| int *last_d, *temp_d, /* forward vectors */ |
| *rlast_d, *rtemp_d; /* backward vectors */ |
| |
| edit_script_p_t head1, tail1, head2, tail2; |
| int midc, rmidc; |
| int start, lower, upper; |
| int rstart, rlower, rupper; |
| int c, k, row; |
| int mi, mj, tmp, ll, uu; |
| char flag; |
| |
| *head = *tail = NULL; |
| |
| /* Boundary cases */ |
| if (i1 == i2) { |
| if (j1 == j2) *head = NULL; |
| else { |
| head1 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); |
| head1->op_type = INSERT; |
| head1->num = j2-j1; |
| head1->next = NULL; |
| *head = *tail = head1; |
| } |
| return; |
| } |
| |
| if (j1 == j2) { |
| head1 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); |
| head1->op_type = DELETE; |
| head1->num = i2-i1; |
| head1->next = NULL; |
| *head = *tail = head1; |
| return; |
| } |
| |
| if (dist <= 1) { |
| start = j1-i1; |
| if (j2-i2 == j1-i1) { |
| head1 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); |
| head1->op_type = SUBSTITUTE; |
| head1->num = i2-i1; |
| head1->next = NULL; |
| *head = *tail = head1; |
| } else if (j2-j1 == i2-i1+1) { |
| |
| tmp = snake(seq1,seq2,start,i1,i2,j2); |
| if (tmp>i1) { |
| head1 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); |
| head1->op_type = SUBSTITUTE; |
| head1->num = tmp-i1; |
| *head = head1; |
| } |
| head2 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); |
| head2->op_type = INSERT; |
| head2->num = 1; |
| |
| if (*head) head1->next = head2; |
| else *head = head2; |
| *tail = head2; |
| head2->next = NULL; |
| |
| if (i2-tmp) { |
| head1 = head2; |
| *tail = head2 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); |
| head2->op_type = SUBSTITUTE; |
| head2->num = i2-tmp; |
| head2->next = NULL; |
| head1->next = head2; |
| } |
| } else if (j2-j1+1 == i2-i1) { |
| |
| tmp = snake(seq1,seq2,start,i1,i2,j2); |
| if (tmp>i1) { |
| head1 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); |
| head1->op_type = SUBSTITUTE; |
| head1->num = tmp-i1; |
| *head = head1; |
| } |
| head2 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); |
| head2->op_type = DELETE; |
| head2->num = 1; |
| |
| if (*head) head1->next = head2; |
| else *head = head2; |
| *tail = head2; |
| head2->next = NULL; |
| |
| if (i2>tmp+1) { |
| head1 = head2; |
| *tail = head2 = (edit_script_p_t) xmalloc(sizeof(edit_script_t)); |
| head2->op_type = SUBSTITUTE; |
| head2->num = i2-tmp-1; |
| head2->next = NULL; |
| head1->next = head2; |
| } |
| } else { |
| fprintf(stderr, |
| "align.c: warning: something wrong when aligning."); |
| } |
| return; |
| } |
| |
| /* Divide the problem at the middle cost */ |
| midc = dist/2; |
| rmidc = dist - midc; |
| |
| /* Compute the boundary diagonals */ |
| start = j1 - i1; |
| lower = max(j1-i2, start-midc); |
| upper = min(j2-i1, start+midc); |
| rstart = j2-i2; |
| rlower = max(j1-i2, rstart-rmidc); |
| rupper = min(j2-i1, rstart+rmidc); |
| |
| /* Allocate space for forward vectors */ |
| last_d = (int *)xmalloc((upper-lower+1)*sizeof(int)) - lower; |
| temp_d = (int *)xmalloc((upper-lower+1)*sizeof(int)) - lower; |
| |
| for (k=lower; k<=upper; k++) last_d[k] = -1; |
| last_d[start] = snake(seq1,seq2,start,i1,i2,j2); |
| |
| /* Forward computation */ |
| for (c=1; c<=midc; ++c) { |
| ll = max(lower,start-c); |
| uu = min(upper,start+c); |
| for (k=ll; k<=uu; ++k) { |
| if (k == ll) { |
| /* DELETE : down from (k+1,c-1) */ |
| row = last_d[k+1]+1; |
| } else if (k == uu) { |
| /* INSERT : right from (k-1,c-1) */ |
| row = last_d[k-1]; |
| } else if ((last_d[k]>=last_d[k+1]) && |
| (last_d[k]+1>=last_d[k-1])) { |
| /* SUBSTITUTE */ |
| row = last_d[k]+1; |
| } else if ((last_d[k+1]+1>=last_d[k-1]) && |
| (last_d[k+1]>=last_d[k])) { |
| /* DELETE */ |
| row = last_d[k+1]+1; |
| } else { |
| /* INSERT */ |
| row = last_d[k-1]; |
| } |
| |
| temp_d[k] = snake(seq1,seq2,k,row,i2,j2); |
| } |
| for (k=ll; k<=uu; ++k) |
| last_d[k] = temp_d[k]; |
| } |
| |
| /* Allocate space for backward vectors */ |
| rlast_d = (int *)xmalloc((rupper-rlower+1)*sizeof(int)) - rlower; |
| rtemp_d = (int *)xmalloc((rupper-rlower+1)*sizeof(int)) - rlower; |
| |
| for (k=rlower; k<=rupper; k++) rlast_d[k] = i2+1; |
| rlast_d[rstart] = rsnake(seq1,seq2,rstart,i2,i1,j1,M,N); |
| |
| /* Backward computation */ |
| for (c=1; c<=rmidc; ++c) { |
| ll = max(rlower,rstart-c); |
| uu = min(rupper,rstart+c); |
| for (k=ll; k<=uu; ++k) { |
| if (k == ll) { |
| /* INSERT : left from (k+1,c-1) */ |
| row = rlast_d[k+1]; |
| } else if (k == uu) { |
| /* DELETE : up from (k-1,c-1) */ |
| row = rlast_d[k-1]-1; |
| } else if ((rlast_d[k]-1<=rlast_d[k+1]) && |
| (rlast_d[k]-1<=rlast_d[k-1]-1)) { |
| /* SUBSTITUTE */ |
| row = rlast_d[k]-1; |
| } else if ((rlast_d[k-1]-1<=rlast_d[k+1]) && |
| (rlast_d[k-1]-1<=rlast_d[k]-1)) { |
| /* DELETE */ |
| row = rlast_d[k-1]-1; |
| } else { |
| /* INSERT */ |
| row = rlast_d[k+1]; |
| } |
| |
| rtemp_d[k] = rsnake(seq1,seq2,k,row,i1,j1,M,N); |
| } |
| for (k=ll; k<=uu; ++k) |
| rlast_d[k] = rtemp_d[k]; |
| } |
| |
| /* Find (mi, mj) such that the distance from (i1, j1) to (mi, mj) is |
| midc and the distance from (mi, mj) to (i2, j2) is rmidc. |
| */ |
| |
| flag = 0; |
| mi = i1; mj = j1; |
| ll = max(lower,rlower); uu = min(upper,rupper); |
| for (k=ll; k<=uu; ++k) { |
| if (last_d[k]>=rlast_d[k]) { |
| if (last_d[k]-i1>=i2-rlast_d[k]) { |
| mi = last_d[k]; mj = k+mi; |
| } else { |
| mi = rlast_d[k]; mj = k+mi; |
| } |
| flag = 1; |
| |
| break; |
| } |
| } |
| free(last_d+lower); free(rlast_d+rlower); |
| free(temp_d+lower); free(rtemp_d+rlower); |
| |
| if (flag) { |
| /* Find a path from (i1,j1) to (mi,mj) */ |
| align_path(seq1,seq2,i1,j1,mi,mj,midc,&head1,&tail1,M,N); |
| |
| /* Find a path from (mi,mj) to (i2,j2) */ |
| align_path(seq1,seq2,mi,mj,i2,j2,rmidc,&head2,&tail2,M,N); |
| |
| /* Join these two paths together */ |
| if (head1) tail1->next = head2; |
| else head1 = head2; |
| } else { |
| fprintf(stderr, |
| "align.c: warning: something wrong when dividing\n"); |
| head1 = NULL; |
| } |
| *head = head1; |
| if (head2) *tail = tail2; |
| else *tail = tail1; |
| } |
| |
| |
| int |
| align_get_dist(uchar *seq1, uchar *seq2, int i1, int j1, int i2, int j2, |
| int limit) |
| { |
| int *last_d, *temp_d; |
| int goal_diag, ll, uu; |
| int c, k, row; |
| int start, lower, upper; |
| |
| /* Compute the boundary diagonals */ |
| start = j1 - i1; |
| lower = max(j1-i2, start-limit); |
| upper = min(j2-i1, start+limit); |
| goal_diag = j2-i2; |
| |
| if (goal_diag > upper || goal_diag < lower) |
| return -1; |
| |
| /* Allocate space for forward vectors */ |
| last_d = (int *)xmalloc((upper-lower+1)*sizeof(int)) - lower; |
| temp_d = (int *)xmalloc((upper-lower+1)*sizeof(int)) - lower; |
| |
| /* Initialization */ |
| for (k=lower; k<=upper; ++k) last_d[k] = INT_MIN; |
| last_d[start] = snake(seq1,seq2,start, i1, i2, j2); |
| |
| if (last_d[goal_diag] >= i2) { |
| /* Free working vectors */ |
| free(last_d+lower); |
| free(temp_d+lower); |
| return 0; |
| } |
| |
| for (c=1; c<=limit; ++c) { |
| ll = max(lower,start-c); uu = min(upper, start+c); |
| for (k=ll; k<=uu; ++k) { |
| if (k == ll) |
| row = last_d[k+1]+1; /* DELETE */ |
| else if (k == uu) |
| row = last_d[k-1]; /* INSERT */ |
| else if ((last_d[k]>=last_d[k+1]) && |
| (last_d[k]+1>=last_d[k-1])) |
| row = last_d[k]+1; /*SUBSTITUTE */ |
| else if ((last_d[k+1]+1>=last_d[k-1]) && |
| (last_d[k+1]>=last_d[k])) |
| row = last_d[k+1]+1; /* DELETE */ |
| else |
| row = last_d[k-1]; /* INSERT */ |
| |
| temp_d[k] = snake(seq1,seq2,k,row,i2,j2); |
| } |
| |
| for (k=ll; k<=uu; ++k) last_d[k] = temp_d[k]; |
| |
| if (last_d[goal_diag] >= i2) { |
| /* Free working vectors */ |
| free(last_d+lower); |
| free(temp_d+lower); |
| return c; |
| } |
| } |
| |
| /* Ran out of distance limit */ |
| return -1; |
| } |
| |
| /* Condense_both_Ends -- merge contiguous operations of the same type */ |
| /* together; return both new ends of the chain. */ |
| void Condense_both_Ends (edit_script_p_t *head, edit_script_p_t *tail, |
| edit_script_p_t *prev) |
| { |
| edit_script_p_t tp, tp1; |
| |
| tp = *head; *prev = NULL; |
| while (tp != NULL) { |
| while (((tp1 = tp->next) != NULL) && (tp->op_type == tp1->op_type)) { |
| tp->num = tp->num + tp1->num; |
| tp->next = tp1->next; |
| free(tp1); |
| } |
| if (tp->next) *prev = tp; |
| else *tail = tp; |
| tp = tp->next; |
| } |
| } |
| |
| void S2A(edit_script_p_t head, int *S, int flag) |
| { |
| edit_script_p_t tp; |
| int *lastS, i; |
| |
| tp = head; |
| lastS = S; |
| while (tp != NULL) { |
| /* |
| printf("tp->op_type=%d, tp->num=%d\n",tp->op_type, tp->num); |
| */ |
| if (tp->op_type == SUBSTITUTE) { |
| for (i=0; i<tp->num; ++i) *lastS++ = 0; |
| } else if (tp->op_type == INSERT) { |
| *lastS++ = (!flag) ? tp->num : (0-tp->num); |
| } else { /* DELETE */ |
| *lastS++ = (!flag) ? (0 - tp->num) : tp->num; |
| } |
| tp = tp->next; |
| } |
| *(S-1) = lastS - S; |
| } |
| |
| /* Alignment display routine */ |
| |
| static uchar ALINE[51], BLINE[51], CLINE[51]; |
| |
| static int |
| get_pos_width(collec_p_t eCol) |
| { |
| unsigned int last = eCol->e.exon[eCol->nb - 1]->to1 + options.dnaOffset; |
| unsigned int w = 1; |
| while ((last = last / 10) > 0) |
| w += 1; |
| if (w < 7) |
| w = 7; |
| return w; |
| } |
| |
| void |
| IDISPLAY(uchar *A, uchar *B, unsigned int M, unsigned int N, |
| int *S, unsigned int AP, unsigned int BP, collec_p_t eCol, |
| int direction) |
| { |
| uchar *a, *b, *c, sign; |
| int op, index, starti; |
| unsigned int i, j, lines, ap, bp, pWidth; |
| unsigned int ii = 0; |
| exon_p_t ep; |
| |
| assert(eCol->nb > 0); |
| |
| pWidth = get_pos_width(eCol); |
| /* find the starting exon for this alignment */ |
| while (ii < eCol->nb |
| && ((ep = eCol->e.exon[ii])->from1 != AP || ep->from2 != BP)) |
| ii += 1; |
| if (ii >= eCol->nb) |
| fatal("align.c: Alignment fragment not found.\n"); |
| i = j = op = lines = index = 0; |
| sign = '*'; |
| ap = AP; |
| bp = BP; |
| a = ALINE; |
| b = BLINE; |
| c = CLINE; |
| starti = (ii < eCol->nb - 1) ? (int) ep->to1 + 1 : -1; |
| |
| while (i < M || j < N) { |
| if (op == 0 && *S == 0) { |
| op = *S++; |
| *a = A[++i]; |
| *b = B[++j]; |
| *c++ = (*a++ == *b++) ? '|' : ' '; |
| } else { |
| if (op == 0) |
| op = *S++; |
| if (op > 0) { |
| *a++ = ' '; |
| *b++ = B[++j]; |
| *c++ = '-'; |
| op--; |
| } else { |
| if ((int) (i + AP) == starti) { |
| /* detected intron */ |
| if (ep->type < 0 || direction == 0) |
| sign = '='; |
| else if (direction > 0) |
| sign = '>'; |
| else |
| sign = '<'; |
| ii += 1; |
| ep = (ii < eCol->nb) ? eCol->e.exon[ii] : NULL; |
| starti = (ii < eCol->nb - 1) ? (int) ep->to1 + 1 : -1; |
| index = 1; |
| *c++ = sign; |
| *a++ = A[++i]; |
| *b++ = ' '; |
| op++; |
| } else if (!index) { |
| *c++ = '-'; |
| *a++ = A[++i]; |
| *b++ = ' '; |
| op++; |
| } else { |
| /* not the first deletion in the intron */ |
| switch (index) { |
| case 0: |
| case 1: |
| case 2: |
| *a++ = A[++i]; |
| *b++ = ' '; |
| *c++ = sign; |
| op++; |
| index++; |
| break; |
| case 3: |
| case 4: |
| *a++ = '.'; |
| *b++ = ' '; |
| *c++ = '.'; |
| i++; |
| op++; |
| index++; |
| break; |
| case 5: |
| *a++ = '.'; |
| *b++ = ' '; |
| *c++ = '.'; |
| i += (-op) - 3; |
| op = -3; |
| index++; |
| break; |
| case 6: |
| case 7: |
| *a++ = A[++i]; |
| *b++ = ' '; |
| *c++ = sign; |
| op++; |
| index++; |
| break; |
| case 8: |
| *a++ = A[++i]; |
| *b++ = ' '; |
| *c++ = sign; |
| op++; |
| index = 0; |
| break; |
| } |
| } |
| } |
| } |
| if (a >= ALINE + 50 || (i >= M && j >= N)) { |
| *a = *b = *c = '\0'; |
| printf("\n%*u ", pWidth, 50 * lines++); |
| for (b = ALINE + 10; b <= a; b += 10) |
| printf(" . :"); |
| if (b <= a + 5) |
| printf(" ."); |
| printf("\n%*u %s\n%*s %s\n%*u %s\n", |
| pWidth, ap + options.dnaOffset, ALINE, |
| pWidth, " ", CLINE, |
| pWidth, bp, BLINE); |
| ap = AP + i; |
| bp = BP + j; |
| a = ALINE; |
| b = BLINE; |
| c = CLINE; |
| } |
| } |
| } |
| |
| void |
| Free_script(edit_script_p_t head) |
| { |
| edit_script_p_t tp, tp1; |
| |
| tp = head; |
| while (tp != NULL) { |
| tp1 = tp->next; |
| free(tp); |
| tp = tp1; |
| } |
| } |
| |
| static int |
| snake(uchar *seq1, uchar *seq2, int k, int x, int endx, int endy) |
| { |
| int y; |
| |
| if (x<0) return x; |
| y = x+k; |
| while (x<endx && y<endy && seq1[x]==seq2[y]) { |
| ++x; ++y; |
| } |
| return x; |
| } |
| |
| |
| static int |
| rsnake(uchar *seq1, uchar *seq2, int k, int x, int startx, int starty, |
| int M, int N) |
| { |
| int y; |
| |
| if (x > M) |
| return x; |
| if (startx < 0 || starty < 0) |
| fprintf(stderr, "TROUBLE!!! startx: %5d, starty: %5d\n",startx, starty); |
| if (x + k > N) |
| fprintf(stderr, "TROUBLE!!! x: %5d, y: %5d\n",x,x+k); |
| |
| y = x + k; |
| while (x > startx && y > starty && seq1[x - 1] == seq2[y - 1]) { |
| x -= 1; |
| y -= 1; |
| } |
| return x; |
| } |