加载中…
个人资料
  • 博客等级:
  • 博客积分:
  • 博客访问:
  • 关注人气:
  • 获赠金笔:0支
  • 赠出金笔:0支
  • 荣誉徽章:
正文 字体大小:

Smith-Waterman算法实现(C)

(2011-07-17 16:29:26)
标签:

smith-waterman

生物计算

c

it

Smith-Waterman Local Similarity Algorithm Implementation


We implement the program of Smith-Waterman local similarity algorithm. With this algorithm, we cannot simply define the cost of each action in functions like match and indel, since we may use different scoring matrices, such as BLOSUM80 and BLOSUM62, to build my scoring function. I just store this kind of matrix in an input file called matrix and my program will read the information of the matrix from the input file. For the two sequences to be analyzed, we store them in two files called inputs and inputt.

 

(1) First we work with the normal matrix provided in the class note. Thus, we should use the original match and indel function provided by class note.

 

For DNA sequences, we can get some locally similar part of two sequences. They are almost same and have some different bases in some position. But they can still encode the similar even the same protein sequence. .

 

For plain text string, we use the same scoring matrix with previous one. We would like to find some difference between the alignments of text string and DNA sequence, but there is no obvious difference. What we can merely find is just the result local alignment of two sequences.

 

(2) Then, we run the program with scoring matrix BLOSUM62 and some sequences from different kind of organisms. For instance, the cost of the alignment of protein sequence from Fowlpox PIR and the chain K of Crystal Structure Of The Swine-Origin A is 847.

 

In this part, we define S, the rate of similarity of alignment as, S=A/L, in which A is the length of final alignment and L is the sum of length of two sequences.

 

For instance, when we run the program on two pieces of protein sequences all from human, the similarity score is higher than the similarity score obtained from alignment of sequence from human and virus.

 

And we do this on more pairs of sequences. We find that it is a little bit more often for two sequences from same organism to own a larger similarity score than that from different organisms.

 

(3) Finally, We run the program with both matrix BLOSUM80 and BLOSUM62 on the same sequence to see the difference between these two scoring matrices. For example, I separately run the program with BLOSUM62 and BLOSUM80 on protein sequences from Fowlpox PIR and crystal structure of the Swine-Origin A. According to the output of program, the result of former one is 847 and the result of later one is 779. And when we work on two pieces of protein sequences from cat with both BLOSUM80 and BLOSUM62, the score with BLOSUM62 is 581 and the score with BLOSUM80 is 505.

 

Then we test this on more sets of data, and we find that when we use BLOSUM62, the score is always larger. We think it is because of the scoring matrix. For the matches appearing more often, BLOSUM62 gives them a higher score than BLOSUM80.


Smith-Waterman算法如下:


//inputs--->sequence s

//inputt--->sequence t

//matrix--->stroing the scoring matrix

 

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

 

#define MAXLINE 6000

#define MATCH  0

#define INSERT 1

#define DELETE 2       

 

typedef struct {

       int cost;

    int parent;

} cell;

 

cell m[MAXLINE][MAXLINE];

int num_base;

int **matrix;

char *bases;

 

void goal_cell(char *s, char *t, int *x, int *y); 

void cell_init(int i, int j);

int string_compare(char *s, char *t);

void reconstruct_path(char *s, char *t, int i, int j, int index);

void print_matrix(char *s, char *t);

int getIndex(char ch);

int getScore(char a, char b);

 

main()

{

       int count;

       char ch;

 

       FILE *matrixfp;

       matrixfp=fopen("matrix", "r");

 

       char ch_numBase[MAXLINE];

       fgets(ch_numBase, MAXLINE, matrixfp);

 

       num_base=atoi(ch_numBase); 

 

       bases=(char *)malloc(sizeof(char)*num_base);

       matrix=(int **)malloc(sizeof(int *)*num_base);

       for(count=0;count<num_base;count++)

       {

              matrix[count]=(int *)malloc(sizeof(int)*num_base);

       }

 

       for(count=0;count<num_base;count++)

       {

              ch=fgetc(matrixfp);

              bases[count]=ch;

              ch=fgetc(matrixfp);

       }

        

       char tempScore[MAXLINE];

 

       int count_1;

       int count_2;

       for(count_1=0;count_1<num_base;count_1++)

       {

              for(count_2=0;count_2<num_base;count_2++)

              {

                     fgets(tempScore, MAXLINE, matrixfp);

                     matrix[count_1][count_2]=atoi(tempScore);

              }

       }

 

       for(count_1=0;count_1<num_base;count_1++)

       {

              for(count_2=0;count_2<num_base;count_2++)

              {

                     printf(" %d ", matrix[count_1][count_2]);

              }

              printf("\n");

       }

 

       fclose(matrixfp);

 

       FILE *fdps;

       char s[MAXLINE];

       s[0]=' ';

       fdps=fopen("inputs","r");

       fgets(&s[1], MAXLINE, fdps);

       fclose(fdps);

        

       FILE *fdpt;

       char t[MAXLINE];

       t[0]=' ';

       fdpt=fopen("inputt","r");

       fgets(&t[1], MAXLINE, fdpt);

       fclose(fdpt);

        

       int i,j;

 

       int maxscore=string_compare(s,t);

 

       printf("\nMaximun Score : %d \n\n", maxscore);

       print_matrix(s,t); 

       printf("\n");

 

       goal_cell(s,t,&i,&j);

       printf("Position With Maximum Score: (%d,%d)\n\n",i,j);

 

       printf("Alignment First: ");

       reconstruct_path(s,t,i,j,1);

       printf("\n");

 

       printf("Alignment Second:");

       reconstruct_path(s,t,i,j,2);

       printf("\n");

       printf("\nMaximun Score : %d \n\n", maxscore);

}

 

void goal_cell(char *s, char *t, int *x, int *y)

{

       int i,j;      

       *x=*y=0;       

       for(i=0;i<strlen(s)-1;i++)

       {

              for(j=0;j<strlen(t)-1;j++)

              {

                     if(m[i][j].cost>m[*x][*y].cost)

                     {

                            *x=i;

                            *y=j;

                     }

              }

       }

}

 

void cell_init(int i, int j)

{

       m[i][j].cost=0;

       m[i][j].parent=-1;

}

 

int string_compare(char *s, char *t)

{

       int i,j,k;

       int opt[3];

 

       for (i=0; i<=strlen(s)-1; i++) 

       {

              for(j=0; j<=strlen(t)-1; j++)

              {

                     cell_init(i,j);

              }

       }

 

       for (i=1; i<strlen(s)-1; i++)

       {

              for (j=1; j<strlen(t)-1; j++) 

              {

                     opt[MATCH] = m[i-1][j-1].cost + getScore(s[i],t[j]);  

                     opt[INSERT] = m[i][j-1].cost + getScore(s[i],t[j]);

                     opt[DELETE] = m[i-1][j].cost + getScore(s[i],t[j]);

                     m[i][j].cost = 0;

                     m[i][j].parent = -1;

 

                     for (k=MATCH; k<=DELETE; k++)

                     {

                            if (opt[k] > m[i][j].cost) 

                            {

                                   m[i][j].cost = opt[k];

                                   m[i][j].parent = k;

                            }

                     }

              }

       }

       goal_cell(s,t,&i,&j);

       return( m[i][j].cost );

} 

 

//index 1->s 2->t

void reconstruct_path(char *s, char *t, int i, int j, int index)

{

       if (m[i][j].parent == -1) return;

 

       if (m[i][j].parent == MATCH) 

       {

              reconstruct_path(s,t,i-1,j-1, index);

              printf("%c",s[i]);

              return;

       }

        if (m[i][j].parent == INSERT) 

              {

                     reconstruct_path(s,t,i,j-1,index);

                      

                     if(index==2)

                     {    

                            printf("%c",t[j]);

                     }

               

                     return;

        }

        if (m[i][j].parent == DELETE) 

              {

            reconstruct_path(s,t,i-1,j,index);

                     if(index==1)

                     {

                            printf("%c",s[i]);

                     }

 

                     return;

        }

}

 

void print_matrix(char *s, char *t)

{

       int i,j;                   

       int x,y;

 

       x = strlen(s)-1;

       y = strlen(t)-1;

 

       printf("   ");

       for (i=0; i<y; i++)

       {

              printf("  %c",t[i]);

       }

       printf("\n");

 

       for (i=0; i<x; i++) 

       {

              printf("%c: ",s[i]);

              for (j=0; j<y; j++) 

              {

                     printf(" -",m[i][j].cost);

              }

              printf("\n");

       }

}

 

int getIndex(char ch)

{

       int count;

 

       for(count=0;count<num_base;count++)

       {

              if(bases[count]==ch)

              {

                     return count;

              }

       }

 

       return -1;

}

 

int getScore(char a, char b)

{

       int index_a=getIndex(a);

       int index_b=getIndex(b);

 

       return matrix[index_a][index_b];

}


0

阅读 收藏 喜欢 打印举报/Report
  

新浪BLOG意见反馈留言板 欢迎批评指正

新浪简介 | About Sina | 广告服务 | 联系我们 | 招聘信息 | 网站律师 | SINA English | 产品答疑

新浪公司 版权所有