blob: dfe0ea9fdf7550691989229290c2dfa01f5163ef [file] [log] [blame]
/* $Id$
*
* Christian Iseli, LICR ITO, Christian.Iseli@licr.org
*
* Copyright (c) 2001-2006 Swiss Institute of Bioinformatics.
* Copyright (C) 1998-2001 Liliana Florea.
*/
#define _GNU_SOURCE 1
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <search.h>
#include <ctype.h>
#include "sim4.h"
#include "sim4b1.h"
#include "align.h"
#include "misc.h"
int encoding[NACHARS];
static void merge(collec_p_t, collec_p_t, unsigned int, unsigned int);
static void slide_intron(result_p_t, uchar *, uchar *);
static void compact_exons(collec_p_t, unsigned int);
static int greedy(uchar *, uchar *, unsigned int, unsigned int,
unsigned int, unsigned int, unsigned int, collec_p_t);
static int extend_bw(uchar *,uchar *,int,int,int,int,int *,int *, int);
static int extend_fw(uchar *,uchar *,int,int,int,int,int *,int *, int);
static int pluri_align(uchar *, uchar *, unsigned int *, collec_p_t,
edit_script_list_p_t *, int, int);
static exon_p_t new_exon(unsigned int, unsigned int,
unsigned int,unsigned int);
static void extend_hit(int, int, hash_env_p_t, const uchar * const,
unsigned int, int, collec_p_t, int *);
static int msp_compare(const void *, const void *);
static int msp_rna_compare(const void *, const void *);
static void search(hash_env_p_t, uchar *,
unsigned int, int, collec_p_t);
static void trim_small_repeated_msps(collec_p_t);
static void combine_msps(collec_p_t);
static int link_msps(collec_p_t, unsigned int, unsigned int);
static void msp2exons(exon_p_t *, int, collec_p_t, int);
static void exon_cores(hash_env_p_t, uchar *, unsigned int, int, int, int,
collec_p_t, collec_p_t, collec_p_t);
static int good_ratio(int, int);
static void swap_seqs(collec_p_t);
static int SWscore(uchar *, uchar *, unsigned int);
#ifdef DEBUG
static void debug_print_exons(collec_p_t, const char *,
const unsigned char *, const unsigned char *);
#endif
static int
is_polyAT_exon_p(exon_p_t e, const unsigned char *s)
{
unsigned int cntA = 0;
unsigned int cntC = 0;
unsigned int cntG = 0;
unsigned int cntT = 0;
unsigned int cntN = 0;
unsigned int i;
unsigned int len = e->to2 - e->from2 + 1;
for (i = e->from2 - 1; i < e->to2; i++)
switch (s[i]) {
case 'A':
cntA += 1;
break;
case 'C':
cntC += 1;
break;
case 'G':
cntG += 1;
break;
case 'T':
cntT += 1;
break;
default:
cntN += 1;
}
len -= cntN;
if (len < MIN_INTRON) {
if ((cntA * 10) / len >= 7
|| ((cntA + cntG) * 10) / len >= 8
|| (cntT * 10) / len >= 7
|| ((cntT + cntC) * 10) / len >= 8)
return 1;
} else {
if ((cntA * 10) / len >= 8
|| ((cntA + cntG) * 100) / len >= 95
|| (cntT * 10) / len >= 8
|| ((cntT + cntC) * 100) / len >= 95)
return 1;
}
return 0;
}
static void
kill_polyA(result_p_t res, const unsigned char *s1, const unsigned char *s2)
{
unsigned int i;
collec_p_t eCol = &res->eCol;
/* Stupid initialization below to avoid spurious uninitialized warning
* from GCC... */
struct {int score; unsigned int cnt; unsigned int d;} best = best;
i = 0;
while (i < eCol->nb && is_polyAT_exon_p(eCol->e.exon[i], s2))
i += 1;
if (i > 0) {
unsigned int j;
for (j = 0; j < i; j++)
free(eCol->e.exon[j]);
memmove(eCol->e.elt, eCol->e.elt + i,
(eCol->nb - i) * sizeof(void *));
eCol->nb -= i;
}
i = 0;
while (i < eCol->nb
&& is_polyAT_exon_p(eCol->e.exon[eCol->nb - i - 1], s2))
i += 1;
if (i > 0) {
unsigned int j;
for (j = eCol->nb - i; j < eCol->nb; j++)
free(eCol->e.exon[j]);
eCol->nb -= i;
}
if (eCol->nb > 0) {
exon_p_t e = eCol->e.exon[eCol->nb - 1];
unsigned int cntAs1 = 0, cntAs2 = 0, j = 0;
int score = 0;
const unsigned char *s = s2 + e->to2;
best.score = 0;
while (*s && best.score - score < 10) {
j += 1;
switch (*s) {
case 'A':
cntAs2 += 1;
score += 1;
if (score > best.score) {
best.score = score;
best.cnt = cntAs2;
best.d = j;
}
break;
case 'N':
break;
default:
score -= 2;
}
s += 1;
}
if (best.score > 0 && best.cnt >= 8 && (best.cnt * 10) / best.d >= 8) {
s = s1 + e->to1;
j = 0;
while (*s && j < best.d) {
j += 1;
if (*s == 'A')
cntAs1 += 1;
s += 1;
}
if (j > 0 && (cntAs1 * 10) / j < 8) {
res->st.polyA_cut = 1;
}
}
}
if (eCol->nb > 0) {
exon_p_t e = eCol->e.exon[0];
unsigned int cntTs1 = 0, cntTs2 = 0, j = 0;
int score = 0;
const unsigned char *s = s2 + e->from2 - 2;
best.score = 0;
while (s >= s2 && best.score - score < 10) {
j += 1;
switch (*s) {
case 'T':
cntTs2 += 1;
score += 1;
if (score > best.score) {
best.score = score;
best.cnt = cntTs2;
best.d = j;
}
break;
case 'N':
break;
default:
score -= 2;
}
s -= 1;
}
if (best.score > 0 && best.cnt >= 8 && (best.cnt * 10) / best.d >= 8) {
s = s1 + e->from1 - 2;
j = 0;
while (s >= s1 && j < best.d) {
j += 1;
if (*s == 'T')
cntTs1 += 1;
s -= 1;
}
if (j > 0 && (cntTs1 * 10) / j < 8) {
res->st.polyT_cut = 1;
}
}
}
}
static void
grow_exon_left(exon_p_t e, uchar *s1, uchar *s2)
{
uchar *p1 = s1 + e->from1 - 2;
uchar *p2 = s2 + e->from2 - 2;
while (p1 >= s1 && p2 >= s2 && *p1 == *p2) {
p1 -= 1;
p2 -= 1;
e->from1 -= 1;
e->from2 -= 1;
}
}
static void
grow_exon_right(exon_p_t e, uchar *s1, unsigned int l1,
uchar *s2, unsigned int l2)
{
while (e->to1 < l1 && e->to2 < l2 && s1[e->to1] == s2[e->to2]) {
e->to1 += 1;
e->to2 += 1;
}
}
/* seq1 = genomic DNA (text); seq2 = cDNA */
void
SIM4(hash_env_p_t he, seq_p_t seq2, collec_p_t res)
{
collec_t mCol;
collec_t tem_eCol;
int align_status;
unsigned int curRes;
if (he->len == 0 || seq2->len == 0)
return;
init_col(&mCol, 5);
/* Compute the distance between two sequences A and B */
exon_cores(he, seq2->seq, seq2->len, 1, 1, options.K, &mCol, res, NULL);
init_col(&tem_eCol, 0);
for (curRes = 0; curRes < res->nb; curRes++) {
result_p_t r = res->e.result[curRes];
collec_p_t eCol = &r->eCol;
sim4_stats_p_t st = &r->st;
#ifdef DEBUG
debug_print_exons(eCol, "LSIS", he->seq, seq2->seq);
#endif
/* Chase down polyA tails. */
st->polyA_cut = 0;
st->polyT_cut = 0;
kill_polyA(r, he->seq, seq2->seq);
#ifdef DEBUG
debug_print_exons(eCol, "LSIS 2", he->seq, seq2->seq);
#endif
if (eCol->nb == 0)
continue;
/* Look at the first exon, and try to extend it backward. */
if (!st->polyT_cut && eCol->e.exon[0]->from2 > 1) {
exon_p_t e = eCol->e.exon[0];
unsigned int i = 0;
if (e->from2 - 1 > (MIN_INTRON << 1)
&& e->from1 - 1 > r->dStart) {
hash_env_t tem_he;
#ifdef DEBUG
fprintf(stderr, "Find new exons (head) %d %d\n", e->from1, e->from2);
#endif
init_hash_env(&tem_he, min(10, he->W), seq2->seq, e->from2 - 1);
bld_table(&tem_he);
exon_cores(&tem_he, he->seq + r->dStart, e->from1 - r->dStart - 1,
1, r->dStart + 1, options.C, &mCol, NULL, &tem_eCol);
free_hash_env(&tem_he);
/* Insert new exons (merging if needed), swaping seqs. */
if (tem_eCol.nb > 0) {
swap_seqs(&tem_eCol);
grow_exon_right(tem_eCol.e.exon[tem_eCol.nb - 1],
he->seq, he->len, seq2->seq, seq2->len);
merge(eCol, &tem_eCol, 0, he->W);
tem_eCol.nb = 0;
e = eCol->e.exon[0];
}
}
while (i < eCol->nb && is_polyAT_exon_p(eCol->e.exon[i], seq2->seq))
i += 1;
if (i > 0) {
unsigned int j;
for (j = 0; j < i; j++)
free(eCol->e.exon[j]);
memmove(eCol->e.elt, eCol->e.elt + i,
(eCol->nb - i) * sizeof(void *));
eCol->nb -= i;
if (eCol->nb == 0)
continue;
e = eCol->e.exon[0];
}
if (e->from2 - 1 > 0) {
int diff = min(e->from2 - 1, MAX_GRINIT >> 1);
int u = min(4 * diff, (int) e->from1 - 1);
int I, J, cost;
#ifdef DEBUG
fprintf(stderr, "extend_bw from %d\n", e->from2);
#endif
cost = extend_bw(seq2->seq + e->from2 - 1 - diff,
he->seq + e->from1 - 1 - u,
diff, u, e->from2 - 1 - diff, e->from1 - 1 - u,
&I, &J, he->W);
#ifdef DEBUG
fprintf(stderr, "extend_bw returned %d, I: %d J: %d\n", cost, I, J);
#endif
if (((int) e->from2 - 1 - I) * options.matchScore
+ cost * options.mismatchScore >= 0) {
e->from2 = I + 1;
e->from1 = J + 1;
}
}
}
/* Look at the last exon, and try to extend it forward. */
if (!st->polyA_cut && eCol->e.exon[eCol->nb - 1]->to2 < seq2->len) {
exon_p_t e = eCol->e.exon[eCol->nb - 1];
unsigned int i = 0;
if (seq2->len - e->to2 > (MIN_INTRON << 1)
&& e->to1 < r->dStart + r->dLen) {
hash_env_t tem_he;
#ifdef DEBUG
fprintf(stderr, "Find new exons (tail) %d %d\n", e->to1, e->to2);
#endif
init_hash_env(&tem_he, min(10, he->W),
seq2->seq + e->to2, seq2->len - e->to2);
bld_table(&tem_he);
exon_cores(&tem_he, he->seq + e->to1, r->dStart + r->dLen - e->to1,
e->to2 + 1, e->to1 + 1, options.C, &mCol, NULL, &tem_eCol);
free_hash_env(&tem_he);
/* Append new exons (merging if needed), swaping seqs. */
if (tem_eCol.nb > 0) {
swap_seqs(&tem_eCol);
grow_exon_left(tem_eCol.e.exon[0], he->seq, seq2->seq);
merge(eCol, &tem_eCol, eCol->nb, he->W);
tem_eCol.nb = 0;
e = eCol->e.exon[eCol->nb - 1];
}
}
while (i < eCol->nb
&& is_polyAT_exon_p(eCol->e.exon[eCol->nb - i - 1], seq2->seq))
i += 1;
if (i > 0) {
unsigned int j;
for (j = eCol->nb - i; j < eCol->nb; j++)
free(eCol->e.exon[j]);
eCol->nb -= i;
if (eCol->nb == 0)
continue;
e = eCol->e.exon[eCol->nb - 1];
}
if (seq2->len - e->to2 > 0) {
int diff = min(seq2->len - e->to2, MAX_GRINIT >> 1);
int cost, I, J;
#ifdef DEBUG
fprintf(stderr, "extend_fw from %d (%d)\n", e->to2, diff);
#endif
cost = extend_fw(seq2->seq + e->to2, he->seq + e->to1, diff,
min(4 * diff, (int) (he->len - e->to1)),
e->to2, e->to1, &I, &J, he->W);
#ifdef DEBUG
fprintf(stderr, "extend_fw returned %d, I: %d J: %d\n", cost, I, J);
#endif
if ((I - (int) e->to2) * options.matchScore
+ cost * options.mismatchScore >= 0) {
e->to2 = I;
e->to1 = J;
}
}
}
/* Proceed in case of several exons. */
if (eCol->nb > 1) {
unsigned int i;
for (i = 1; i < eCol->nb; i++) {
exon_p_t cur = eCol->e.exon[i - 1];
exon_p_t next = eCol->e.exon[i];
int diff = next->from2 - cur->to2 - 1;
if (diff > 0) {
/* bridge the gap (provided there is one...) */
if (next->from1 - 1 > cur->to1) {
hash_env_t tem_he;
if (diff <= MAX_GRINIT) {
int cost;
#ifdef DEBUG
fprintf(stderr, "Trying greedy %d %d\n",
cur->to2, next->from2);
#endif
cost = greedy(seq2->seq + cur->to2, he->seq + cur->to1, diff,
next->from1 - cur->to1 - 1,
cur->to2, cur->to1, he->W, &tem_eCol);
if (tem_eCol.nb > 0
&& cost <= max(he->W, P * diff + 1)) {
grow_exon_left(tem_eCol.e.exon[0], he->seq, seq2->seq);
grow_exon_right(tem_eCol.e.exon[tem_eCol.nb - 1],
he->seq, he->len, seq2->seq, seq2->len);
merge(eCol, &tem_eCol, i, he->W);
tem_eCol.nb = 0;
i -= 1;
continue;
}
}
#ifdef DEBUG
fprintf(stderr, "Find new exons %d %d\n",
cur->to1, next->from1);
#endif
init_hash_env(&tem_he, min(8, he->W), he->seq + cur->to1,
next->from1 - cur->to1 - 1);
bld_table(&tem_he);
exon_cores(&tem_he, seq2->seq + cur->to2, diff, cur->to1 + 1,
cur->to2 + 1, options.C, &mCol, NULL, &tem_eCol);
free_hash_env(&tem_he);
if (tem_eCol.nb > 0) {
grow_exon_left(tem_eCol.e.exon[0], he->seq, seq2->seq);
grow_exon_right(tem_eCol.e.exon[tem_eCol.nb - 1],
he->seq, he->len, seq2->seq, seq2->len);
merge(eCol, &tem_eCol, i, he->W);
tem_eCol.nb = 0;
i -= 1;
}
}
}
}
}
/* Re-check for polyA. */
kill_polyA(r, he->seq, seq2->seq);
/* just printing ... */
#ifdef DEBUG
debug_print_exons(eCol, "EXTENSIONS", he->seq, seq2->seq);
#endif
/* compaction step; note: it resets the right end of the list to */
/* the last item in the block list */
compact_exons(eCol, he->W);
/* just printing ... */
#ifdef DEBUG
debug_print_exons(eCol, "NORMALIZATION", he->seq, seq2->seq);
#endif
/* eliminate marginal small blocks at the start of the sequence; */
if (eCol->nb > 0) {
unsigned int i = 0;
while (i < eCol->nb) {
exon_p_t e = eCol->e.exon[i];
if (e->to2 - e->from2 + 1 >= he->W)
break;
free(e);
i += 1;
}
if (i > 0) {
memmove(eCol->e.elt, eCol->e.elt + i,
(eCol->nb - i) * sizeof(void *));
eCol->nb -= i;
}
}
/* eliminate marginal small blocks at the end of the sequence */
if (eCol->nb > 0) {
int i = eCol->nb - 1;
while (i >= 0) {
exon_p_t e = eCol->e.exon[i];
if (e->to2 - e->from2 + 1 >= he->W)
break;
free(e);
i -= 1;
eCol->nb -= 1;
}
}
/* Slide exon boundaries for optimal intron signals */
slide_intron(r, he->seq, seq2->seq);
/* */
align_status = pluri_align(he->seq, seq2->seq, &(st->nmatches), eCol,
&r->sList, he->len, seq2->len);
if (align_status != 0 || !options.ali_flag) {
free_align(r->sList);
r->sList = NULL;
}
}
free(mCol.e.elt);
free(tem_eCol.e.elt);
}
void
init_col(collec_p_t c, unsigned int size)
{
c->size = size;
c->nb = 0;
if (size > 0)
c->e.elt = (void **) xmalloc(size * sizeof(void *));
else
c->e.elt = NULL;
}
static void
add_col_elt(collec_p_t c, void *elt)
{
if (c->size <= c->nb) {
c->size += 5;
c->e.elt = (void **) xrealloc(c->e.elt, c->size * sizeof(void *));
}
c->e.elt[c->nb++] = elt;
}
#ifdef DEBUG
static void
debug_msps(hash_env_p_t he, uchar *s2, collec_p_t mCol, char *title)
{
unsigned int j;
fputs(title, stderr);
for (j = 0; j < mCol->nb; ++j) {
exon_p_t m = mCol->e.exon[j];
fprintf(stderr, "[%d] %d-%d %d-%d, %d %d\n", j,
m->from1, m->to1, m->from2, m->to2, m->score, m->Score);
}
if (he == NULL)
return;
for (j = 0; j < mCol->nb; ++j) {
exon_p_t m = mCol->e.exon[j];
fprintf(stderr, "%.10s %.*s %.10s\n%.10s %.*s %.10s\n",
(m->from1 >= 10)
? he->seq + m->from1 - 10
: he->seq,
m->to1 - m->from1 + 1, he->seq + m->from1,
he->seq + m->to1 + 1,
(m->from2 >= 10)
? s2 + m->from2 - 10
: s2,
m->to2 - m->from2 + 1, s2 + m->from2,
s2 + m->to2 + 1);
}
}
static void
debug_organized_msps(collec_p_t mCol, int last_msp, char *title)
{
int i;
fputs(title, stderr);
for (i = last_msp; i >= 0; i = mCol->e.exon[i]->prev) {
exon_p_t m = mCol->e.exon[i];
fprintf(stderr, "[%d] %d-%d %d-%d, %d %d\n", i,
m->from1, m->to1, m->from2, m->to2, m->score, m->Score);
}
}
#endif
static void
exon_cores(hash_env_p_t he, uchar *s2, unsigned int len2,
int offset1, int offset2, int K,
collec_p_t mCol, collec_p_t res, collec_p_t eCol)
{
unsigned int j;
int last_msp;
int swapped = eCol != NULL; /* True when sequences were swapped. */
search(he, s2, len2, K, mCol);
#ifdef DEBUG
debug_msps(he, s2, mCol, "==== unsorted MSPs\n");
#endif
/* Kill small repeated segments. */
qsort(mCol->e.exon, mCol->nb, sizeof(exon_p_t), msp_rna_compare);
trim_small_repeated_msps(mCol);
#ifdef DEBUG
debug_msps(he, s2, mCol, "==== sorted MSPs\n");
#endif
/* sort in order of mp->pos1. */
qsort(mCol->e.exon, mCol->nb, sizeof(exon_p_t), msp_compare);
combine_msps(mCol);
#ifdef DEBUG
debug_msps(NULL, NULL, mCol, "==== sorted, combined MSPs\n");
#endif
/* Check for duplicated genes if requested. */
if (eCol == NULL) {
result_p_t r;
unsigned int minMPos = len2;
unsigned int maxMPos = 0;
unsigned int cov;
unsigned int globScore;
int tested = 0;
assert(res != NULL);
for (j = 0; j < mCol->nb; j++) {
exon_p_t m = mCol->e.exon[j];
if (m->from2 < minMPos)
minMPos = m->from2;
if (m->to2 > maxMPos)
maxMPos = m->to2;
}
cov = maxMPos - minMPos + 1;
cov = cov / 4;
minMPos += cov;
if (maxMPos > cov)
maxMPos -= cov;
for (j = 0; j < mCol->nb; j++) {
exon_p_t m = mCol->e.exon[j];
m->bot = m->from2 < minMPos;
m->top = m->to2 > maxMPos;
}
#ifdef DEBUG
fprintf(stderr, "==== top, max: %d\n", maxMPos);
for (j = 0; j < mCol->nb; ++j) {
exon_p_t m = mCol->e.exon[j];
fprintf(stderr, "[%d] %d-%d %d-%d, %d %d, %d\n", j,
m->from1, m->to1, m->from2, m->to2,
m->score, m->Score, m->top);
}
#endif
last_msp = link_msps(mCol, 0, mCol->nb);
if (last_msp < 0)
return;
globScore = mCol->e.exon[last_msp]->Score;
/* When filtering above 50%, check that both pieces have good scores.
When filtering below 50%, check that one of the pieces has 75%. */
if (options.filterPct >= 50) {
globScore = globScore * options.filterPct / 100;
} else {
globScore -= globScore / 4;
}
/* See if we have split points, and if the parts have good scores. */
minMPos = 0;
maxMPos = 0;
for (j = 1; j < mCol->nb; j++) {
exon_p_t p = mCol->e.exon[j - 1];
exon_p_t m = mCol->e.exon[j];
if ((p->top && !m->top)
|| (!p->bot && m->bot)
|| (p->top && m->bot)) {
/* We have a split. */
int lLast;
unsigned int lScore, rScore;
tested = 1;
lLast = link_msps(mCol, minMPos, j);
assert(lLast >= 0);
lScore = mCol->e.exon[lLast]->Score;
last_msp = link_msps(mCol, j, mCol->nb);
assert(last_msp >= 0);
rScore = mCol->e.exon[last_msp]->Score;
#ifdef DEBUG
fprintf(stderr,
"glob: %d, l: %d, r: %d, minP: %d, maxP: %d, j: %d\n",
globScore, lScore, rScore, minMPos, maxMPos, j);
#endif
if ((options.filterPct >= 50
&& lScore >= globScore
&& rScore >= globScore)
|| (options.filterPct < 50
&& (lScore >= globScore || rScore >= globScore))) {
unsigned int k;
/* Good split. Store it for processing. */
add_col_elt(res, xcalloc(1, sizeof(result_t)));
r = res->e.result[res->nb - 1];
r->dStart = maxMPos;
r->dLen = m->from1 - maxMPos;
eCol = &r->eCol;
#ifdef DEBUG
fprintf(stderr, "dStart: %u, dLen: %u\n", r->dStart, r->dLen);
debug_organized_msps(mCol, lLast, "==== organized MSPs (part)\n");
#endif
init_col(eCol, j - minMPos);
msp2exons(mCol->e.exon, lLast, eCol, 0);
for (k = 0; k < eCol->nb; k++) {
exon_p_t e = eCol->e.exon[k];
e->to1 += offset1;
e->from1 += offset1;
e->to2 += offset2;
e->from2 += offset2;
}
minMPos = j;
maxMPos = mCol->e.exon[lLast]->to1;
tested = 0;
}
}
}
if (tested)
last_msp = link_msps(mCol, minMPos, mCol->nb);
add_col_elt(res, xcalloc(1, sizeof(result_t)));
r = res->e.result[res->nb - 1];
r->dStart = maxMPos;
r->dLen = he->len - maxMPos;
#ifdef DEBUG
fprintf(stderr, "dStart: %u, dLen: %u\n", r->dStart, r->dLen);
#endif
eCol = &r->eCol;
} else
last_msp = link_msps(mCol, 0, mCol->nb);
/* organize Blast hits (MSPs) into exons */
#ifdef DEBUG
debug_organized_msps(mCol, last_msp, "==== organized MSPs\n");
#endif
if (eCol->size == 0)
init_col(eCol, mCol->nb);
msp2exons(mCol->e.exon, last_msp, eCol, swapped);
for (j = 0; j < eCol->nb; j++) {
exon_p_t e = eCol->e.exon[j];
e->to1 += offset1;
e->from1 += offset1;
e->to2 += offset2;
e->from2 += offset2;
}
mCol->nb = 0;
}
static inline int
lies_after_p(exon_p_t a, exon_p_t b)
{
/* When we have some overlap, make sure it is only a small part. */
/* ------------------
---------------------
| p1 | p2 | p3 | */
if (b->from1 > a->to1) {
unsigned int p1;
unsigned int p2;
unsigned int p3;
if (b->from2 > a->to2)
return 1;
if (b->from2 < a->from2 || b->to2 < a->to2)
return 0;
p1 = b->from2 - a->from2;
p2 = a->to2 - b->from2;
p3 = b->to2 - a->to2;
if (p1 > p2 && p3 > p2 && p1 > options.K && p3 > options.K)
return 1;
} else if (b->from2 > a->to2) {
unsigned int p1;
unsigned int p2;
unsigned int p3;
if (b->from1 < a->from1 || b->to1 < a->to1)
return 0;
p1 = b->from1 - a->from1;
p2 = a->to1 - b->from1;
p3 = b->to1 - a->to1;
if (p1 > p2 && p3 > p2 && p1 > options.K && p3 > options.K)
return 1;
}
return 0;
}
#define SMALL_EXON 50
#define MIN_REPEAT 20
#define JITTER_FACTOR 5
static void
trim_small_repeated_msps(collec_p_t mCol)
{
unsigned int i = 0;
while (i < mCol->nb) {
exon_p_t m = mCol->e.exon[i];
unsigned int j, k, end;
if (m->to2 - m->from2 >= SMALL_EXON) {
i += 1;
continue;
}
end = m->to2 + JITTER_FACTOR;
j = i + 1;
while (j < mCol->nb && mCol->e.exon[j]->to2 <= end)
j += 1;
if (j - i < MIN_REPEAT) {
i += 1;
continue;
}
for (k = i; k < j; k++)
free(mCol->e.exon[k]);
memmove(mCol->e.exon + i, mCol->e.exon + j,
(mCol->nb - j) * sizeof(exon_p_t));
mCol->nb -= (j - i);
}
}
static void
combine_msps(collec_p_t mCol)
{
unsigned int i = 0;
while (i < mCol->nb) {
exon_p_t m = mCol->e.exon[i];
unsigned int ovl = 0;
unsigned int j;
for (j = i + 1; j < mCol->nb; j++) {
exon_p_t n = mCol->e.exon[j];
unsigned int o = 0;
if (n->from2 <= m->to2 + 1)
ovl = m->to2 - n->from2 + 2;
if (n->from1 > m->from1
&& n->from1 <= m->to1 + 1)
o = m->to1 - n->from1 + 2;
if ((ovl == 0) == (o == 0)
&& abs((int) ovl - (int) o) <= 10)
break;
ovl = 0;
}
if (ovl != 0) {
exon_p_t n = mCol->e.exon[j];
unsigned int nScore = m->score + n->score;
if (nScore >= ovl + 1)
nScore -= ovl + 1;
else
nScore = 0;
m->from1 = min(m->from1, n->from1);
m->from2 = min(m->from2, n->from2);
m->to1 = max(m->to1, n->to1);
m->to2 = max(m->to2, n->to2);
if (nScore > m->score)
m->score = nScore;
mCol->nb -= 1;
free(n);
memmove(mCol->e.exon + j, mCol->e.exon + j + 1,
(mCol->nb - j) * sizeof(exon_p_t));
} else
i += 1;
}
}
static int
link_msps(collec_p_t mCol, unsigned int start, unsigned int stop)
{
struct {
unsigned int elt;
unsigned int score;
} best;
unsigned int i;
if (start >= stop)
return -1;
memset(&best, 0, sizeof(best));
for (i = start; i < stop; i++) {
exon_p_t m = mCol->e.exon[i];
m->Score = 0;
m->prev = -1;
}
for (i = start; i < stop; i++) {
exon_p_t m = mCol->e.exon[i];
unsigned int j;
m->Score += m->score;
if (m->Score > best.score) {
best.score = m->Score;
best.elt = i;
}
for (j = i + 1; j < stop; j++) {
exon_p_t n = mCol->e.exon[j];
if (lies_after_p(m, n) && m->Score >= n->Score) {
unsigned int penalty;
penalty = abs(n->from1 - m->from1) >> 15;
penalty += abs(n->from2 - m->from2) >> 15;
if (penalty < m->Score) {
n->Score = m->Score - penalty;
n->prev = i;
}
}
}
}
return best.elt;
}
void
init_encoding(void)
{
unsigned int i;
for (i = 0; i < NACHARS; i++)
encoding[i] = 4;
encoding['A'] = 0;
encoding['C'] = 1;
encoding['G'] = 2;
encoding['T'] = 3;
}
void
init_hash_env(hash_env_p_t he, unsigned int W, uchar *seq, unsigned int len)
{
he->W = W;
he->seq = seq;
he->len = len;
he->mask = (1 << (W + W - 2)) - 1;
he->next_pos = (int *) xmalloc((len + 1) * sizeof(int));
he->hashtab = (void **)
xcalloc(HASH_SIZE, sizeof(void *));
}
#ifndef __GLIBC__
void
tdestroy(void *VROOT, void(*FREEFCT)(void *))
{
}
#endif
void
free_hash_env(hash_env_p_t he)
{
unsigned int hval;
free(he->next_pos);
for (hval = 0; hval < HASH_SIZE; hval++) {
tdestroy(he->hashtab[hval], free);
}
free(he->hashtab);
}
static int
hash_node_compare(const void *a, const void *b)
{
const hash_node_p_t ha = (hash_node_p_t) a, hb = (hash_node_p_t)b;
if (ha->ecode < hb->ecode)
return -1;
if (ha->ecode > hb->ecode)
return 1;
return 0;
}
/* add_word - add a word to the table of critical words */
static inline void
add_word(hash_env_p_t he, unsigned int ecode, unsigned int pos)
{
hash_node_p_t h = (hash_node_p_t) xmalloc(sizeof(hash_node_t));
hash_node_p_t *key;
h->ecode = ecode;
key = tsearch(h, he->hashtab + (ecode & HASH_MASK), hash_node_compare);
assert(key != NULL);
if (*key != h) {
free(h);
he->next_pos[pos] = (*key)->pos;
} else {
he->next_pos[pos] = -1;
}
(*key)->pos = pos;
}
/* ----------- build table of W-tuples in one of the sequences ------------*/
void
bld_table(hash_env_p_t he)
{
unsigned int ecode;
unsigned int i = 0;
uchar *t;
/* skip any word containing an N/X */
t = he->seq;
while (i < he->len) {
unsigned int j;
restart:
ecode = 0;
for (j = 1; j < he->W && i < he->len; j++) {
unsigned int tmp = encoding[*t++];
i += 1;
if (tmp > 3) goto restart;
ecode = (ecode << 2) + tmp;
}
while (i < he->len) {
unsigned int tmp = encoding[*t++];
i += 1;
if (tmp > 3) goto restart;
ecode = ((ecode & he->mask) << 2) + tmp;
add_word(he, ecode, i);
}
}
}
/* ----------------------- search the other sequence ---------------------*/
static void
search(hash_env_p_t he, uchar *s2, unsigned int len2, int K,
collec_p_t mCol)
{
uchar *t;
unsigned int i = 0;
int *allocated = xcalloc(he->len + len2 + 1, sizeof(int));
int *diag_lev = allocated + he->len;
t = s2;
while (i < len2) {
unsigned int j;
hash_node_t hn;
restart:
hn.ecode = 0;
for (j = 1; j < he->W && i < len2; j++) {
unsigned int tmp = encoding[*t++];
i += 1;
if (tmp > 3) goto restart;
hn.ecode = (hn.ecode << 2) + tmp;
}
while (i < len2) {
unsigned int tmp = encoding[*t++];
hash_node_p_t *key;
i += 1;
if (tmp > 3) goto restart;
hn.ecode = ((hn.ecode & he->mask) << 2) + tmp;
key = tfind(&hn, he->hashtab + (hn.ecode & HASH_MASK),
hash_node_compare);
if (key != NULL) {
int p;
for (p = (*key)->pos; p >= 0; p = he->next_pos[p])
extend_hit(p, i, he, s2, len2, K, mCol, diag_lev);
}
}
}
free(allocated);
}
/* extend_hit - extend a word-sized hit to a longer match */
static void
extend_hit(int pos1, int pos2, hash_env_p_t he, const uchar * const s2,
unsigned int len2, int K, collec_p_t mCol, int *diag_lev)
{
const uchar *beg2, *beg1, *end1, *q, *s;
int right_sum, left_sum, sum, diag, score;
diag = pos2 - pos1;
if (diag_lev[diag] > pos1)
return;
/* extend to the right */
left_sum = sum = 0;
q = he->seq + pos1;
s = s2 + pos2;
end1 = q;
while (s < s2 + len2
&& q < he->seq + he->len
&& sum >= left_sum - options.X) {
sum += ((*s++ == *q++)
? options.matchScore
: options.mismatchScore);
if (sum > left_sum) {
left_sum = sum;
end1 = q;
}
}
/* extend to the left */
right_sum = sum = 0;
beg1 = q = (he->seq + pos1) - he->W;
beg2 = s = (s2 + pos2) - he->W;
while ((s > s2) && (q > he->seq) && sum >= right_sum - options.X) {
sum += ((*(--s) == *(--q))
? options.matchScore
: options.mismatchScore);
if (sum > right_sum) {
right_sum = sum;
beg2 = s;
beg1 = q;
}
}
score = he->W + left_sum + right_sum;
if (score >= K) {
add_col_elt(mCol,
new_exon(beg1 - he->seq, beg2 - s2,
end1 - he->seq - 1, beg2 - s2 + end1 - beg1 - 1));
mCol->e.exon[mCol->nb - 1]->score = score;
}
diag_lev[diag] = (end1 - he->seq) + he->W;
}
/* ---------------------------- sort the MSPs ----------------------------*/
/* msp_compare - determine ordering relationship between two MSPs */
static int
msp_compare(const void *a, const void *b)
{
exon_p_t ki = * (exon_p_t *) a, kj = * (exon_p_t *) b;
if (ki->from1 > kj->from1)
return 1;
if (ki->from1 < kj->from1)
return -1;
if (ki->from2 > kj->from2)
return 1;
if (ki->from2 < kj->from2)
return -1;
return 0;
}
/* msp_rna_compare - determine RNA ordering relationship between two MSPs */
static int
msp_rna_compare(const void *a, const void *b)
{
exon_p_t ki = * (exon_p_t *) a, kj = * (exon_p_t *) b;
if (ki->from2 > kj->from2)
return 1;
if (ki->from2 < kj->from2)
return -1;
if (ki->to2 > kj->to2)
return -1;
if (ki->to2 < kj->to2)
return 1;
return 0;
}
/* --------------------- organize the MSPs into exons ---------------------*/
static void
msp2exons(exon_p_t *msp, int last_msp, collec_p_t eCol, int swapped)
{
while (last_msp >= 0) {
exon_p_t mp = msp[last_msp];
if (eCol->nb > 0) {
/* See if we merge with next exon (we go in reverse). */
exon_p_t next = eCol->e.exon[eCol->nb - 1];
if (!swapped
&& next->from1 < mp->to1 + MIN_INTRON
&& next->from2 > mp->to2 - 1) {
next->to1 = max(next->to1, mp->to1);
next->to2 = max(next->to2, mp->to2);
next->from1 = min(next->from1, mp->from1);
next->from2 = min(next->from2, mp->from2);
last_msp = mp->prev;
free(mp);
continue;
}
}
add_col_elt(eCol, mp);
last_msp = mp->prev;
}
/* Now, need to reverse the exons... */
if (eCol->nb > 1) {
unsigned int i, j;
for (i = 0, j = eCol->nb - 1; j > i; i++, j--) {
exon_p_t e = eCol->e.exon[i];
eCol->e.exon[i] = eCol->e.exon[j];
eCol->e.exon[j] = e;
}
}
}
/* ---------------------- print endpoints of exons --------------------*/
void
print_exons(collec_p_t eCol, int direction)
{
unsigned int i;
unsigned int last = eCol->nb - 1;
exon_p_t cur;
assert(eCol->nb > 0);
for (i = 0; i < last; i++) {
cur = eCol->e.exon[i];
if (direction == 0 || cur->type < 0)
printf("%u-%u (%u-%u) %u%% ==\n",
cur->from1 + options.dnaOffset, cur->to1 + options.dnaOffset,
cur->from2, cur->to2, cur->score);
else
printf("%u-%u (%u-%u) %u%% %s (%.2s/%.2s) %u\n",
cur->from1 + options.dnaOffset, cur->to1 + options.dnaOffset,
cur->from2, cur->to2, cur->score,
direction > 0 ? "->" : "<-",
options.splice[cur->type].fwd,
options.splice[cur->type].fwd + 2,
cur->splScore);
}
cur = eCol->e.exon[last];
printf("%u-%u (%u-%u) %u%%\n",
cur->from1 + options.dnaOffset, cur->to1 + options.dnaOffset,
cur->from2, cur->to2, cur->score);
}
static int
pluri_align(uchar *seq1, uchar *seq2, unsigned int *num_matches,
collec_p_t eCol, edit_script_list_p_t *Aligns, int M, int N)
{
exon_t eFake;
exon_p_t cur = &eFake;
int diff, end1, end2, ali_dist;
unsigned int nmatches = 0;
edit_script_p_t head;
int ii;
head = NULL;
*Aligns = NULL;
ali_dist = 0;
end1 = M;
end2 = N;
eFake.from1 = M + 1;
eFake.from2 = N + 1;
eFake.to1 = 0;
eFake.to2 = 0;
for (ii = eCol->nb - 1; ii >= 0; ii--) {
exon_p_t prev = eCol->e.exon[ii];
edit_script_p_t left, right, prevE, tmp_script;
uchar *a, *b;
int tmpi, di_count, alen;
if ((diff = cur->from2 - prev->to2 - 1) != 0) {
if (cur->to1) {
edit_script_list_p_t
enew = (edit_script_list_p_t) xmalloc(sizeof(edit_script_list_t));
enew->next_script = *Aligns;
*Aligns = enew;
enew->script = head;
enew->offset1 = cur->from1;
enew->offset2 = cur->from2;
enew->len1 = end1 - enew->offset1 + 1;
enew->len2 = end2 - enew->offset2 + 1;
enew->score = ali_dist;
ali_dist = 0;
head = NULL;
}
end1 = prev->to1;
end2 = prev->to2;
} else if ((diff = cur->from1 - prev->to1 - 1) != 0
&& cur->to1) {
edit_script_p_t new = (edit_script_p_t) xmalloc(sizeof(edit_script_t));
new->op_type = DELETE;
new->num = diff;
new->next = head;
head = new;
} else if (diff)
end1 = prev->to1;
diff = align_get_dist(seq1, seq2, prev->from1 - 1, prev->from2 - 1,
prev->to1, prev->to2,
max(1000, .2 * (prev->to2 - prev->from2 + 1)));
if (diff < 0)
return -1;
align_path(seq1, seq2, prev->from1 - 1, prev->from2 - 1,
prev->to1, prev->to2, diff, &left, &right, M, N);
if (right == NULL)
return -1;
Condense_both_Ends(&left, &right, &prevE);
if (!cur->to1 && right->op_type == DELETE) {
/* remove gaps at end of alignment */
diff -= 0 + right->num; /* subtract GAP_OPEN = 0 */
prev->to1 -= right->num;
end1 -= right->num;
if (head && (head->op_type == DELETE))
head->num += right->num;
free(right);
prevE->next = NULL;
right = prevE;
}
if (ii == 0 && left && (left->op_type == DELETE)) {
diff -= 0 + left->num; /* subtract GAP_OPEN = 0 */
prev->from1 += left->num;
tmp_script = left->next;
if (right == left)
right = tmp_script;
free(left);
left = tmp_script;
}
ali_dist += diff;
a = seq1 + prev->from1 - 1;
b = seq2 + prev->from2 - 1;
tmpi = di_count = 0;
tmp_script = left;
while (tmp_script) {
switch (tmp_script->op_type) {
case DELETE:
di_count += tmp_script->num;
tmpi += tmp_script->num;
a += tmp_script->num;
break;
case INSERT:
di_count += tmp_script->num;
tmpi += tmp_script->num;
b += tmp_script->num;
break;
case SUBSTITUTE:
{
int j;
for (j = 0; j < tmp_script->num; ++j, ++a, ++b)
if (*a != *b)
tmpi++;
else
nmatches++;
break;
}
}
tmp_script = tmp_script->next;
}
alen = (int) ((prev->to1 - prev->from1 + 1
+ prev->to2 - prev->from2 + 1 + di_count)
/ (double) 2);
prev->score = ((alen - tmpi) * 100) / alen;
right->next = head;
head = left;
cur = prev;
}
/* at the beginning of the sequences */
if ((diff = cur->from2 - 1) != 0 && diff != N) {
edit_script_list_p_t
enew = (edit_script_list_p_t) xmalloc(sizeof(edit_script_list_t));
enew->next_script = *Aligns;
*Aligns = enew;
enew->offset1 = cur->from1;
enew->offset2 = cur->from2;
enew->len1 = end1 - enew->offset1 + 1;
enew->len2 = end2 - enew->offset2 + 1;
enew->script = head;
enew->score = ali_dist;
} else if (diff != N) {
/* modified to cut introns at the beginning of the sequence */
edit_script_list_p_t
enew = (edit_script_list_p_t) xmalloc(sizeof(edit_script_list_t));
enew->next_script = *Aligns;
*Aligns = enew;
enew->offset1 = cur->from1;
enew->offset2 = 1;
enew->len1 = end1 - enew->offset1 + 1;
enew->len2 = end2 - enew->offset2 + 1;
enew->script = head;
enew->score = ali_dist;
}
*num_matches = nmatches;
return 0;
}
static exon_p_t
new_exon(unsigned int f1, unsigned int f2, unsigned int t1, unsigned int t2)
{
exon_p_t e = (exon_p_t) xmalloc(sizeof(exon_t));
e->from1 = f1;
e->from2 = f2;
e->to1 = t1;
e->to2 = t2;
return e;
}
/* FIXME: why are s1 and s2 reversed here, wrt SIM4 ??? */
static int
greedy(uchar *s1, uchar *s2, unsigned int m, unsigned int n,
unsigned int offset1, unsigned int offset2, unsigned int W,
collec_p_t eCol)
{
int col, /* column number */
k, /* current diagonal */
blower,flower, /* boundaries for searching diagonals */
bupper,fupper,
row, /* row number */
DELTA, /* n-m */
B_ORIGIN, F_ORIGIN;
unsigned int d, /* current distance */
max_d, /* bound on size of edit script */
Cost,
MAX_D,
i;
int back, forth; /* backward and forward limits at exit */
int *blast_d, *flast_d, /* rows containing the last d (at crt step, d-1) */
*btemp_d, *ftemp_d; /* rows containing tmp values for the last d */
int *min_row, *min_diag, /* min (b)/ max (f) row (and diagonal) */
*max_row, *max_diag; /* reached for cost d=0, ... m. */
/* No point trying to span megabase-sized holes... */
if (n >= 1000000)
return 0;
DELTA = n - m;
/*max_d = MAX_D = m+1; */
max_d = MAX_D = max(W, (unsigned int) (P * m + 1));
if (DELTA < 0) {
if (m <= min(W, (1 + P) * n)) {
add_col_elt(eCol,
new_exon(offset2 + 1, offset1 + 1,
offset2 + n, offset1 + m));
return m - n + (unsigned int) (P * n + 1);
} else {
return max(W, (unsigned int) (P * m + 1)) + 1;
}
}
F_ORIGIN = MAX_D;
B_ORIGIN = MAX_D - DELTA;
for (row = m, col = n;
row > 0 && col > 0 && (s1[row - 1] == s2[col - 1]);
row--,col--)
/*LINTED empty loop body*/;
if (row == 0) {
/* hit last row; stop search */
add_col_elt(eCol,
new_exon(offset2 - m + n + 1, offset1 + 1,
offset2 + n, offset1 + m));
return 0;
}
blast_d = (int *) xmalloc((MAX_D + n + 1) * sizeof(int));
btemp_d = (int *) xmalloc((MAX_D + n + 1) * sizeof(int));
for (i = 0; i <= MAX_D + n; ++i) {
blast_d[i] = m + 1;
btemp_d[i] = m + 1;
}
blast_d[B_ORIGIN + DELTA] = row;
blower = B_ORIGIN + DELTA - 1;
bupper = B_ORIGIN + DELTA + 1;
for (row = 0;
(unsigned int) row < n
&& (unsigned int) row < m
&& (s1[row] == s2[row]);
row++)
/*LINTED empty loop body*/;
if ((unsigned int) row == m) {
/* hit last row; stop search */
add_col_elt(eCol,
new_exon(offset2 + 1, offset1 + 1,
offset2 + m, offset1 + m));
free(blast_d);
free(btemp_d);
return 0;
}
flast_d = (int *) xmalloc((MAX_D + n + 1) * sizeof(int));
ftemp_d = (int *) xmalloc((MAX_D + n + 1) * sizeof(int));
for (i = 0; i <= MAX_D + n; ++i) {
flast_d[i] = -1;
ftemp_d[i] = -1;
}
flast_d[F_ORIGIN] = row;
flower = F_ORIGIN - 1;
fupper = F_ORIGIN + 1;
max_row = (int *) xmalloc((MAX_D + 1) * sizeof(int));
min_row = (int *) xmalloc((MAX_D + 1) * sizeof(int));
max_diag = (int *) xmalloc((MAX_D + 1) * sizeof(int));
min_diag = (int *) xmalloc((MAX_D + 1) * sizeof(int));
for (d = 1; d <= MAX_D; d++) {
min_row[d] = m + 1;
max_row[d] = -1;
}
min_row[0] = blast_d[B_ORIGIN + DELTA];
min_diag[0] = B_ORIGIN + DELTA;
max_row[0] = flast_d[F_ORIGIN];
max_diag[0] = F_ORIGIN;
back = forth = -1;
d = 1;
while (d <= max_d) {
/* for each relevant diagonal ... */
for (k = blower; k <= bupper; k++) {
/* process the next edit instruction */
/* find a d on diagonal k */
if (k == -((int) d) + DELTA + B_ORIGIN) {
/* move left from the last d-1 on diagonal k+1 */
row = blast_d[k + 1]; /* INSERT */
} else if (k == (int) d + DELTA + B_ORIGIN) {
/* move up from the last d-1 on diagonal k-1 */
row = blast_d[k - 1] - 1; /* DELETE */
} else if ((blast_d[k] <= blast_d[k + 1])
&& (blast_d[k] - 1 <= blast_d[k - 1])) {
/* substitution */
row = blast_d[k] - 1; /* SUBSTITUTE */
} else if ((blast_d[k - 1] <= blast_d[k + 1] - 1)
&& (blast_d[k - 1] <= blast_d[k] - 1)) {
/* move right from the last d-1 on diagonal k-1 */
row = blast_d[k - 1] - 1; /* DELETE */
} else {
/* move left from the last d-1 on diagonal k+1 */
row = blast_d[k + 1]; /* INSERT */
}
/* code common to the three cases */
col = row + k - B_ORIGIN;
/* slide up the diagonal */
while (row > 0 && col > 0 && (s1[row - 1] == s2[col - 1])) {
--row;
--col;
}
btemp_d[k] = row;
/* if (row == 0 || col == 0) max_d = d; */
} /* for k */
min_row[d] = btemp_d[DELTA + B_ORIGIN];
min_diag[d] = DELTA + B_ORIGIN;
for (k = blower; k <= bupper; ++k) {
blast_d[k] = btemp_d[k];
btemp_d[k] = m + 1;
if (blast_d[k] < min_row[d]) {
min_row[d] = blast_d[k];
min_diag[d] = k;
}
}
/* record cell, if paths overlap with minimum combined cost */
/* obs: it suffices to search up to Cost=min(d-1,(max_d-d)) */
for (Cost = 0; Cost < d; Cost++) {
if ((min_row[d] <= max_row[Cost])
&& ((max_d > d + Cost) || (max_d == d + Cost && (forth < 0)))) {
max_d = d + Cost;
back = d;
forth = Cost;
break;
}
}
--blower; ++bupper;
/* for each relevant diagonal ... */
for (k = flower; k <= fupper; k++) {
/* process the next edit instruction */
/* find a d on diagonal k */
if (k == -((int) d) + F_ORIGIN) {
/* move down from the last d-1 on diagonal k+1 */
row = flast_d[k + 1] + 1; /* DELETE */
} else if (k == (int) d + F_ORIGIN) {
/* move right from the last d-1 on diagonal k-1 */
row = flast_d[k - 1]; /* INSERT */
} else if ((flast_d[k] >= flast_d[k + 1])
&& (flast_d[k] + 1 >= flast_d[k - 1])) {
/* substitution */
row = flast_d[k] + 1; /* SUBSTITUTE */
} else if ((flast_d[k + 1] + 1 >= flast_d[k - 1])
&& (flast_d[k + 1] >= flast_d[k])) {
/* move left from the last d-1 on diagonal k+1 */
row = flast_d[k + 1] + 1; /* DELETE */
} else {
/* move right from the last d-1 on diagonal k-1 */
row = flast_d[k - 1]; /* INSERT */
}
/* code common to the three cases */
col = row + k - F_ORIGIN;
/* slide down the diagonal */
if (row >= 0)
while ((unsigned int) row < m
&& (unsigned int) col < n
&& (s1[row] == s2[col])) {
++row;
++col;
}
ftemp_d[k] = row;
/* if (row == m || col == n) max_d = d; */
} /* for k */
max_row[d] = ftemp_d[F_ORIGIN];
max_diag[d] = F_ORIGIN;
for (k = flower; k <= fupper; ++k) {
flast_d[k] = ftemp_d[k];
ftemp_d[k] = -1;
if (flast_d[k] > max_row[d]) {
max_row[d] = flast_d[k];
max_diag[d] = k;
}
}
/* record backward and forward limits, if minimum combined
* cost in overlapping. Note: it suffices to search up to
* Cost=min(d,(max_d-d)).
*/
for (Cost = 0; Cost <= d; Cost++) {
if ((min_row[Cost] <= max_row[d])
&& ((max_d > d + Cost) || (max_d == d + Cost && (forth < 0)))) {
max_d = d + Cost;
back = Cost;
forth = d;
break;
}
}
--flower; ++fupper;
++d; /* for d */
}
if (d > MAX_D) {
free(blast_d); free(btemp_d);
free(flast_d); free(ftemp_d);
free(min_row); free(min_diag);
free(max_row); free(max_diag);
return d;
}
/*fin:*/
{
unsigned int p1, p2, q1, q2;
if ((int) m - min_row[back] >= max_row[forth]) {
p1 = min_row[back];
p2 = min_row[back] + max_diag[forth] - F_ORIGIN;
q1 = min_row[back];
q2 = min_row[back] + min_diag[back] - B_ORIGIN;
} else {
p1 = max_row[forth];
p2 = max_row[forth] + max_diag[forth] - F_ORIGIN;
q1 = max_row[forth];
q2 = max_row[forth] + min_diag[back] - B_ORIGIN;
}
assert(q1 > 0 || p1 < m);
if (q1 > 0)
add_col_elt(eCol,
new_exon(offset2 + 1, offset1 + 1,
offset2 + p2, offset1 + p1));
if (p1 < m)
add_col_elt(eCol,
new_exon(offset2 + q2 + 1, offset1 + q1 + 1,
offset2 + n, offset1 + m));
}
free(blast_d); free(btemp_d);
free(flast_d); free(ftemp_d);
free(min_row); free(min_diag);
free(max_row); free(max_diag);
return back + forth;
}
static int
about_same_gap_p(unsigned int to1, unsigned int nFrom1,
unsigned int to2, unsigned int nFrom2)
{
unsigned int g1, g2, d;
if (nFrom1 <= to1 || nFrom2 <= to2)
return 0;
g1 = nFrom1 - to1 - 1;
g2 = nFrom2 - to2 - 1;
if (g2 > g1) {
unsigned int tem = g1;
g1 = g2;
g2 = tem;
}
d = g1 - g2;
if ((d * 100) / g1 <= options.gapPct)
return 1;
return 0;
}
/* operates on a list sorted in increasing order of exon coordinates */
static void
compact_exons(collec_p_t eCol, unsigned int W)
{
unsigned int i = 1;
/* Kill stupid overlaping exons. */
while (i < eCol->nb) {
exon_p_t cur = eCol->e.exon[i - 1];
exon_p_t next = eCol->e.exon[i];
unsigned int diff = next->from2 - cur->from2;
if (diff <= options.intron_window) {
eCol->nb -= 1;
if (cur->to2 > next->to2) {
free(next);
memmove(eCol->e.exon + i, eCol->e.exon + i + 1,
(eCol->nb - i) * sizeof(exon_p_t));
if (i < eCol->nb) {
next = eCol->e.exon[i];
cur->to1 += diff;
cur->to2 += diff;
next->from1 -= diff;
next->from2 -= diff;
}
} else {
free(cur);
memmove(eCol->e.exon + i - 1, eCol->e.exon + i,
(eCol->nb - i + 1) * sizeof(exon_p_t));
if (i > 1) {
cur = eCol->e.exon[i - 2];
cur->to1 += diff;
cur->to2 += diff;
next->from1 -= diff;
next->from2 -= diff;
}
}
} else
i += 1;
}
for (i = 1; i < eCol->nb; i++) {
exon_p_t cur = eCol->e.exon[i - 1];
exon_p_t next = eCol->e.exon[i];
if ((next->from1 < cur->to1 + 1 + MIN_INTRON
&& next->from2 <= cur->to2 + 1 + W)
|| about_same_gap_p(cur->to1, next->from1,
cur->to2, next->from2)) {
/* merge blocks cur and next */
cur->to1 = next->to1;
cur->to2 = next->to2;
free(next);
eCol->nb -= 1;
memmove(eCol->e.elt + i, eCol->e.elt + i + 1,
(eCol->nb - i) * sizeof(void *));
i -= 1;
}
}
}
static int
good_ratio(int length, int W)
{
if (length<=W/2) return 2;
else if (length<2*W) return options.cutoff;
else return (int)(.75*P*length+1);
}
static int
extend_bw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2,
int *line1, int *line2, int W)
{
int col, /* column number */
row, /* row number */
max_d, /* bound on the length of the edit script */
d, /* current compressed distance */
k, /* current diagonal */
DELTA, /* n-m */
ORIGIN,
lower,
upper;
int *last_d, *temp_d; /* column containing the last p */
int *min_row, *min_diag; /* min (b)/ max (f) row (and diagonal) */
/* reached for cost d=0, ... m. */
DELTA = n-m;
max_d = m+1;
ORIGIN = m;
for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--)
/*LINTED empty loop body*/;
if ((row == 0) || (col == 0)) {
*line1 = row+offset1;
*line2 = col+offset2;
return 0;
}
last_d = (int *)xmalloc((m+n+1)*sizeof(int));
temp_d = (int *)xmalloc((m+n+1)*sizeof(int));
for (k=0; k<=m+n; ++k) last_d[k]=m+1;
last_d[ORIGIN+DELTA] = row;
lower = ORIGIN + DELTA - 1;
upper = ORIGIN + DELTA + 1;
min_row = (int *)xmalloc((m+1)*sizeof(int));
min_diag = (int *)xmalloc((m+1)*sizeof(int));
for (d=1; d<=m; d++)
min_row[d] = m+1;
min_row[0] = last_d[ORIGIN+DELTA];
min_diag[0] = ORIGIN + DELTA;
d = 0;
while ((++d<=max_d) &&
((d-1<=good_ratio(m-min_row[d-1], W)) ||
((d>=2) && (d-2<=good_ratio(m-min_row[d-2], W))))) {
/* for each relevant diagonal ... */
for (k = lower; k <= upper; k++) {
/* find a d on diagonal k */
if (k==-d+DELTA+ORIGIN) {
/* move down from the last d-1 on diagonal k+1 */
row = last_d[k+1];
/* op = INSERT; */
} else if (k==d+DELTA+ORIGIN) {
/* move right from the last d-1 on diagonal k-1 */
row = last_d[k-1]-1;
/* op = DELETE; */
} else if ((last_d[k]-1<=last_d[k+1]) &&
(last_d[k]-1<=last_d[k-1]-1)) {
/* substitution */
row = last_d[k]-1;
/* op = SUBSTITUTE; */
} else if ((last_d[k-1]-1<=last_d[k+1]) &&
(last_d[k-1]-1<=last_d[k]-1)) {
/* move right from the last d-1 on diagonal k-1 */
row = last_d[k-1]-1;
/* op = DELETE; */
} else {
/* move left from the last d-1 on diagonal k+1 */
row = last_d[k+1];
/* op = INSERT; */
}
/* code common to the three cases */
/* slide down the diagonal */
col = row+k-ORIGIN;
while ((row > 0) && (col > 0) && (s1[row-1]==s2[col-1])) {
row--; col--;
}
temp_d[k] = row;
if ((row == 0) && (col == 0)) {
/* hit southeast corner; have the answer */
free(last_d); free(temp_d);
free(min_row); free(min_diag);
*line1 = row+offset1;
*line2 = col+offset2;
return d;
}
if (row == 0) {
/* hit first row; don't look further */
free(last_d); free(temp_d);
free(min_row); free(min_diag);
*line1 = row+offset1;
*line2 = col+offset2;
return d;
}
if (col == 0) {
/* hit last column; don't look further */
free(last_d); free(temp_d);
free(min_row); free(min_diag);
*line1 = row+offset1;
*line2 = col+offset2;
return d;
}
}
min_row[d] = last_d[ORIGIN+DELTA];
min_diag[d] = ORIGIN+DELTA;
for (k=lower; k<=upper; ++k)
if (temp_d[k]<min_row[d]) {
min_row[d] = temp_d[k];
min_diag[d] = k;
}
for (k=lower; k<=upper; k++) {
last_d[k] = temp_d[k];
}
--lower;
++upper;
}
/* report here the previous maximal match, stored in min_diag and min_row */
while ((d>0) && (min_row[d-1]-min_row[d]<3))
d--;
*line1 = min_row[d]+offset1;
*line2 = min_row[d]+min_diag[d]-ORIGIN+offset2;
free(min_row);
free(min_diag);
free(last_d);
free(temp_d);
return d;
}
static int
extend_fw(uchar *s1, uchar *s2, int m, int n, int offset1, int offset2,
int *line1, int *line2, int W)
{
int col, /* column number */
row, /* row number */
max_d, /* bound on the length of the edit script */
d, /* current compressed distance */
k, /* current diagonal */
ORIGIN,
lower,
upper;
int *last_d, *temp_d; /* column containing the last p */
int *max_row, *max_diag; /* min (b)/ max (f) row (and diagonal) */
/* reached for cost d=0, ... m. */
max_d = m+1;
ORIGIN = m;
for (row=0, col=0; col<n && row<m && (s1[row]==s2[col]); row++, col++)
/*LINTED empty loop body*/;
if (row == m) {
*line1 = row+offset1;
*line2 = col+offset2;
return 0;
}
if (col == n) {
*line1 = row+offset1;
*line2 = col+offset2;
return 0;
}
last_d = (int *)xmalloc((m+n+1)*sizeof(int));
temp_d = (int *)xmalloc((m+n+1)*sizeof(int));
for (k=0; k<=m+n; ++k) last_d[k]=-1;
last_d[ORIGIN] = row;
lower = ORIGIN - 1;
upper = ORIGIN + 1;
max_row = (int *)xmalloc((m+1)*sizeof(int));
max_diag = (int *)xmalloc((m+1)*sizeof(int));
for (d=1; d<=m; d++)
max_row[d] = -1;
max_row[0] = last_d[ORIGIN];
max_diag[0] = ORIGIN;
d = 0;
while ((++d<=max_d) &&
((d-1<=good_ratio(max_row[d-1], W)) ||
((d>=2) && (d-2<=good_ratio(max_row[d-2], W))))) {
/* for each relevant diagonal ... */
for (k = lower; k <= upper; k++) {
/* find a d on diagonal k */
if (k==-d+ORIGIN) {
/* move down from the last d-1 on diagonal k+1 */
row = last_d[k+1]+1;
/* op = DELETE; */
} else if (k==d+ORIGIN) {
/* move right from the last d-1 on diagonal k-1 */
row = last_d[k-1];
/* op = INSERT; */
} else if ((last_d[k]>=last_d[k+1]) &&
(last_d[k]+1>=last_d[k-1])) {
/* substitution */
row = last_d[k]+1;
/* op = SUBSTITUTE; */
} else if ((last_d[k+1]+1>=last_d[k-1]) &&
(last_d[k+1]>=last_d[k])) {
/* move down from the last d-1 on diagonal k+1 */
row = last_d[k+1]+1;
/* op = DELETE; */
} else {
/* move right from the last d-1 on diagonal k-1 */
row = last_d[k-1];
/* op = INSERT; */
}
/* code common to the three cases */
/* slide down the diagonal */
col = row+k-ORIGIN;
if (row>=0)
while ((row < m) && (col < n) && (s1[row]==s2[col])) {
row++; col++;
}
temp_d[k] = row;
if ((row == m) && (col == n)) {
/* hit southeast corner; have the answer */
free(last_d); free(temp_d);
free(max_row); free(max_diag);
*line1 = row+offset1;
*line2 = col+offset2;
return d;
}
if (row == m) {
/* hit last row; don't look further */
free(temp_d); free(last_d);
free(max_row); free(max_diag);
*line1 = row+offset1;
*line2 = col+offset2;
return d;
}
if (col == n) {
/* hit last column; don't look further */
free(temp_d); free(last_d);
free(max_row); free(max_diag);
*line1 = row+offset1;
*line2 = col+offset2;
return d;
}
}
max_row[d] = last_d[ORIGIN];
max_diag[d] = ORIGIN;
for (k=lower; k<=upper; ++k)
if (temp_d[k]>max_row[d]) {
max_row[d] = temp_d[k];
max_diag[d] = k;
}
for (k=lower; k<=upper; k++) {
last_d[k] = temp_d[k];
}
--lower;
++upper;
}
/* report here the previous maximal match, stored in max_diag and max_row */
while ((d>0) && (max_row[d]-max_row[d-1]<3))
d--;
*line1 = max_row[d]+offset1;
*line2 = max_row[d]+max_diag[d]-ORIGIN+offset2;
free(max_row);
free(max_diag);
free(last_d);
free(temp_d);
return d;
/*
if ((d>2) && (max_row[d-1]-max_row[d-2]<3)) {
*line1 = max_row[d-2]+offset1;
*line2 = max_row[d-2]+max_diag[d-2]-ORIGIN+offset2;
free(max_row); free(max_diag);
free(last_d); free(temp_d);
return d-2;
}
*line1 = max_row[d-1]+offset1;
*line2 = max_row[d-1]+max_diag[d-1]-ORIGIN+offset2;
free(max_row);
free(max_diag);
free(last_d);
free(temp_d);
return d-1;
*/
}
static void
swap_seqs(collec_p_t eCol)
{
unsigned int i;
for (i = 0; i < eCol->nb; i++) {
exon_p_t e = eCol->e.exon[i];
int tem = e->from1;
e->from1 = e->from2;
e->from2 = tem;
tem = e->to1;
e->to1 = e->to2;
e->to2 = tem;
}
}
static void
merge(collec_p_t eCol, collec_p_t aCol, unsigned int pos, unsigned int W)
{
unsigned int last = pos + aCol->nb;
unsigned int i;
if (aCol->nb == 0)
return;
/* Make enough room. */
if (eCol->nb + aCol->nb > eCol->size) {
eCol->size = eCol->nb + aCol->nb;
eCol->e.elt = (void **) xrealloc(eCol->e.elt, eCol->size * sizeof(void *));
}
/* Insert the new exons. */
memmove(eCol->e.elt + last,
eCol->e.elt + pos, (eCol->nb - pos) * sizeof(void *));
memcpy(eCol->e.elt + pos, aCol->e.elt, aCol->nb * sizeof(void *));
eCol->nb += aCol->nb;
if (last < eCol->nb)
last += 1;
if (pos == 0)
pos += 1;
for (i = pos; i < last; i++) {
exon_p_t cur = eCol->e.exon[i - 1];
exon_p_t next = eCol->e.exon[i];
/* Check for new exons that migth have gobbled up existing ones. */
if (next->from2 <= cur->from2) {
free(cur);
memmove(eCol->e.elt + i - 1, eCol->e.elt + i,
(eCol->nb - i) * sizeof(void *));
eCol->nb -= 1;
last -= 1;
i -= 1;
continue;
}
if (cur->to2 >= next->to2) {
free(next);
eCol->nb -= 1;
memmove(eCol->e.elt + i, eCol->e.elt + i + 1,
(eCol->nb - i) * sizeof(void *));
last -= 1;
i -= 1;
continue;
}
if (next->from1 < cur->to1 + 1 + MIN_INTRON
&& next->from2 <= cur->to2 + 1 + W) {
/* merge blocks cur and next */
cur->from1 = min(cur->from1, next->from1);
cur->from2 = min(cur->from2, next->from2);
cur->to1 = max(next->to1, cur->to1);
cur->to2 = max(next->to2, cur->to2);
free(next);
eCol->nb -= 1;
memmove(eCol->e.elt + i, eCol->e.elt + i + 1,
(eCol->nb - i) * sizeof(void *));
last -= 1;
i -= 1;
}
}
}
void
free_align(edit_script_list_p_t aligns)
{
edit_script_list_p_t head;
head = aligns;
while ((head=aligns)!=NULL) {
aligns = aligns->next_script;
Free_script(head->script);
free(head);
}
}
#ifdef DEBUG
static void
debug_print_exons(collec_p_t eCol, const char *label,
const unsigned char *s1, const unsigned char *s2)
{
unsigned int i;
fprintf(stderr, "\n====================%s:\n\n", label);
for (i = 0; i < eCol->nb; i++) {
exon_p_t e = eCol->e.exon[i];
fprintf(stderr, " [ %u, %u, %u, %u ]\n",
e->from1, e->from2, e->to1, e->to2);
}
for (i = 0; i < eCol->nb; i++) {
exon_p_t e = eCol->e.exon[i];
int len1 = e->to1 - e->from1 + 1;
int len2 = e->to2 - e->from2 + 1;
if (len1 > 1) {
fprintf(stderr, "%.10s %.*s %.10s\n%.10s %.*s %.10s\n",
(e->from1 > 10)
? s1 + e->from1 - 11
: s1 + e->from1 - 1,
len1, s1 + e->from1 - 1,
s1 + e->to1,
(e->from2 > 10)
? s2 + e->from2 - 11
: s2 + e->from2 - 1,
len2, s2 + e->from2 - 1,
s2 + e->to2);
if (e->from1 > 1 && e->from2 > 1
&& s1[e->from1 - 2] == s2[e->from2 - 2])
fprintf(stderr, "WARNING: further left match: %c\n",
s1[e->from1 - 2]);
if (s1[e->to1] != 0 && s1[e->to1] == s2[e->to2])
fprintf(stderr, "WARNING: further right match: %c\n",
s1[e->to1]);
}
}
}
#endif
static int
perfect_spl_p(uchar *seq1, uchar *seq2, splice_score_p_t splS)
{
unsigned int score, j;
uchar splice[4];
score = SWscore(seq1 + splS->to1 - options.scoreSplice_window,
seq2 + splS->to2 - options.scoreSplice_window,
options.scoreSplice_window);
if (score < options.scoreSplice_window)
return 0;
score = SWscore(seq1 + splS->nFrom1 - 1, seq2 + splS->to2,
options.scoreSplice_window);
if (score < options.scoreSplice_window)
return 0;
memcpy(splice, seq1 + splS->to1, 2);
memcpy(splice + 2, seq1 + splS->nFrom1 - 3, 2);
for (j = 0; j < options.nbSplice; j++) {
if (memcmp(splice, options.splice[j].fwd, 4) == 0) {
splS->type = j;
splS->direction = 1;
return 1;
}
if (memcmp(splice, options.splice[j].rev, 4) == 0) {
splS->type = j;
splS->direction = -1;
return 1;
}
}
return 0;
}
static int
splice_score_compare(const void *a, const void *b)
{
const splice_score_p_t sa = (splice_score_p_t) a;
const splice_score_p_t sb = (splice_score_p_t) b;
if (sa->score < sb->score)
return -1;
if (sa->score > sb->score)
return 1;
if (sa->splScore < sb->splScore)
return -1;
if (sa->splScore > sb->splScore)
return 1;
if (sa->type > sb->type)
return -1;
if (sa->type < sb->type)
return 1;
return 0;
}
static void
compute_max_score_1(uchar *seq1, uchar *seq2, splice_score_p_t splS,
unsigned int type, unsigned int to1, unsigned int to2,
unsigned int nFrom1, uchar *s, uchar *jct, int dir)
{
int j;
memcpy(s + options.scoreSplice_window, jct, 4);
for (j = - options.intron_window; j <= (int) options.intron_window; j++) {
splice_score_t curL, curR;
int i;
curL.type = curR.type = type;
curL.splScore = curR.splScore = 0;
curL.score = curR.score = 0;
memcpy(s, seq2 + to2 - options.scoreSplice_window + j,
options.scoreSplice_window);
memcpy(s + options.scoreSplice_window + 4, seq2 + to2 + j,
options.scoreSplice_window);
for (i = -1; i <= 1; i++) {
splice_score_t cur;
cur.type = type;
cur.splScore = 0;
if (seq1[to1 + j + i] == jct[0])
cur.splScore += 1;
if (seq1[to1 + j + i + 1] == jct[1])
cur.splScore += 1;
cur.score = SWscore(seq1 + to1 - options.scoreSplice_window
+ j + i, s, options.scoreSplice_window + 2);
if (splice_score_compare(&cur, &curL) > 0) {
curL.score = cur.score;
curL.splScore = cur.splScore;
curL.to1 = to1 + j + i;
}
cur.splScore = 0;
if (seq1[nFrom1 - 3 + j + i] == jct[2])
cur.splScore += 1;
if (seq1[nFrom1 - 2 + j + i] == jct[3])
cur.splScore += 1;
cur.score = SWscore(seq1 + nFrom1 - 3 + j + i,
s + options.scoreSplice_window + 2,
options.scoreSplice_window + 2);
if (splice_score_compare(&cur, &curR) > 0) {
curR.score = cur.score;
curR.splScore = cur.splScore;
curR.nFrom1 = nFrom1 + j + i;
}
}
curL.score += curR.score;
curL.splScore += curR.splScore;
if (splice_score_compare(&curL, splS) > 0) {
splS->score = curL.score;
splS->splScore = curL.splScore;
splS->to1 = curL.to1;
splS->to2 = to2 + j;
splS->nFrom1 = curR.nFrom1;
splS->type = type;
splS->direction = dir;
}
}
}
/* FIXME : Frame shifts are a real pain. Look at BM149342 for
* example. The scoring is not quite right in that case. */
static void
compute_max_score(uchar *seq1, uchar *seq2, splice_score_p_t splS,
int direction)
{
unsigned int k;
unsigned int to1 = splS->to1;
unsigned int to2 = splS->to2;
unsigned int nFrom1 = splS->nFrom1;
uchar *s = (uchar *) xmalloc((options.scoreSplice_window * 2 + 4)
* sizeof(uchar));
splS->score = 0;
splS->splScore = 0;
splS->type = -1;
for (k = 0; k < options.nbSplice; k++) {
if (direction >= 0)
compute_max_score_1(seq1, seq2, splS, k, to1, to2, nFrom1, s,
options.splice[k].fwd, 1);
if (direction <= 0)
compute_max_score_1(seq1, seq2, splS, k, to1, to2, nFrom1, s,
options.splice[k].rev, -1);
}
free(s);
}
static void
slide_intron(result_p_t r, uchar *seq1, uchar *seq2)
{
unsigned int i;
/* First, try to get direction through perfect splices. */
for (i = 1; i < r->eCol.nb; i++) {
exon_p_t cur = r->eCol.e.exon[i - 1];
exon_p_t next = r->eCol.e.exon[i];
splice_score_t splS;
cur->type = -1;
cur->direction = 0;