7-12 鹼基序列匹配 (25 分)

中飛院JLF發表於2018-11-21

7-12 鹼基序列匹配 (25 分)

鹼基序列匹配

地理專案是IBM和國家地理學會的合作研究專案,從成千上萬捐獻的DNA分析地球上人類是如何繁衍的。作為一個IBM的研究人員,請你寫一個程式找出給定的DNA片段之間的相同之處,使得對個體的調查相關聯。一個DNA鹼基序列是指把在分子中發現的氮基的序列給羅列出來。有四種氮基:腺嘌呤(A)、胸腺嘧啶(T)、鳥嘌呤(G)和胞嘧啶(C),例如,一個6鹼基DNA序列可以表示為 TAGACC。給出一個DNA鹼基序列的集合,確定在所有序列中都出現的最長的鹼基序列。

輸入格式:
輸入的第一行給出了整數n,表示測試資料集合的數目。每個測試資料集合由下述兩部分組成:一個正整數m(2≤m≤10),給出資料集合中鹼基序列的數目。m行,每行給出一個60鹼基的鹼基序列。

輸出格式:
對於輸入的每個測試資料集合的所有的鹼基序列,輸出最長的相同的鹼基子序列(此處的子序列指連續的子串)。如果最長的相同的鹼基子序列的長度小於3,則輸出“no significant commonalities”來代替鹼基子序列。如果相同最長長度的子序列有多個,則僅輸出按字母排序的第一個。

輸入樣例1:

3
2
GATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
3
GATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATA
GATACTAGATACTAGATACTAGATACTAAAGGAAAGGGAAAAGGGGAAAAAGGGGGAAAA
GATACCAGATACCAGATACCAGATACCAAAGGAAAGGGAAAAGGGGAAAAAGGGGGAAAA
3
CATCATCATCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
ACATCATCATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AACATCATCATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT

輸出樣例1:

no significant commonalities
AGATAC
CATCATCAT

輸入樣例2:

5
2
GATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
3
GATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATA
GATACTAGATACTAGATACTAGATACTAAAGGAAAGGGAAAAGGGGAAAAAGGGGGAAAA
GATACCAGATACCAGATACCAGATACCAAAGGAAAGGGAAAAGGGGAAAAAGGGGGAAAA
3
CATCATCATCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
ACATCATCATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AACATCATCATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
2
GATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGAT
GATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATT
10
GATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGAT
GATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATT
GATTTGATTTGATTTGATTTGATTTGATTTGATTTGATTTGATTTGATTTGATTTGATTT
GATTTTGATTTTGATTTTGATTTTGATTTTGATTTTGATTTTGATTTTGATTTTGATTTT
AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT
AAGATAAGATAAGATAAGATAAGATAAGATAAGATAAGATAAGATAAGATAAGATAAGAT
AAAGATAAAGATAAAGATAAAGATAAAGATAAAGATAAAGATAAAGATAAAGATAAAGAT
CGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT
CCGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATCCGAT
CCCGATCCCGATCCCGATCCCGATCCCGATCCCGATCCCGATCCCGATCCCGATCCCGAT

輸出樣例2:

no significant commonalities
AGATAC
CATCATCAT
TGAT
GAT

輸入樣例3:

11
2
GATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
3
GATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATACCAGATA
GATACTAGATACTAGATACTAGATACTAAAGGAAAGGGAAAAGGGGAAAAAGGGGGAAAA
GATACCAGATACCAGATACCAGATACCAAAGGAAAGGGAAAAGGGGAAAAAGGGGGAAAA
3
CATCATCATCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
ACATCATCATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AACATCATCATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
2
GATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGAT
GATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATT
10
GATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGATGAT
GATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATTGATT
GATTTGATTTGATTTGATTTGATTTGATTTGATTTGATTTGATTTGATTTGATTTGATTT
GATTTTGATTTTGATTTTGATTTTGATTTTGATTTTGATTTTGATTTTGATTTTGATTTT
AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT
AAGATAAGATAAGATAAGATAAGATAAGATAAGATAAGATAAGATAAGATAAGATAAGAT
AAAGATAAAGATAAAGATAAAGATAAAGATAAAGATAAAGATAAAGATAAAGATAAAGAT
CGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT
CCGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATCCGAT
CCCGATCCCGATCCCGATCCCGATCCCGATCCCGATCCCGATCCCGATCCCGATCCCGAT
10
GATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATC
GCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTA
TAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC
CGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT
GTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT
GAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA
GCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGC
ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT
ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
TCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTC
10
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
10
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
GAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
GAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG
GGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG
GGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGG
GGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGG
GGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGG
GGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGG
GGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGG
GGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGG
4
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
10
AAAAAATAAAAAATAAAAAATAAAAAATAAAAAATAAAAAATAAAAAATAAAAAATAAAA
AAAAACAAAAACAAAAACAAAAACAAAAACAAAAACAAAAACAAAAACAAAAACAAAAAC
AAAAAGAAAAAGAAAAAGAAAAAGAAAAAGAAAAAGAAAAAGAAAAAGAAAAAGAAAAAG
AAAAATAAAAATAAAAATAAAAATAAAAATAAAAATAAAAATAAAAATAAAAATAAAAAT
AAAACAAAACAAAACAAAACAAAACAAAACAAAACAAAACAAAACAAAACAAAACAAAAC
AAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAG
AAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAAT
AAACAAACAAACAAACAAACAAACAAACAAACAAACAAACAAACAAACAAACAAACAAAC
AAATAAATAAATAAATAAATAAATAAATAAATAAATAAATAAATAAATAAATAAATAAAT
AAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAG
2
GATGATGCATCATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGACTACTAA
GATGATCATCATACTACTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
ACTACTAGATGATGCATCATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGA
ACTACTCGATGATCATCATCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC

輸出樣例3:

no significant commonalities
AGATAC
CATCATCAT
TGAT
GAT
no significant commonalities
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
no significant commonalities
AAA
ACTACT

此題博主的做法是:由長到短枚舉第一個鹼基序列的每一個子串,再用KMP演算法檢查該子串是否出現在其餘每一個序列中。
程式碼如下:

#include<bits/stdc++.h>
using namespace std;

// Fills `next` with the KMP failure table of pattern S.
//
// next[0] is the sentinel -1. For j >= 1, next[j] is the length of the longest
// proper prefix of S[0..j-1] that is also a suffix of it, i.e. the pattern
// index to resume from after a mismatch at position j.
//
// S    : the pattern.
// next : caller-owned buffer with room for at least S.size() ints.
//
// An empty pattern leaves `next` untouched.
void getNext(const std::string& S, int* next)
{
    // Guard the empty pattern: otherwise S.size()-1 wraps around as an
    // unsigned value and the loop runs far past the end of both buffers.
    if (S.empty())
        return;

    const int len = static_cast<int>(S.size());  // signed, so j < len-1 compares safely
    int j = 0;    // position whose successor entry is being computed
    int k = -1;   // length of the currently matched prefix, -1 = restart
    next[0] = -1;
    while (j < len - 1)
    {
        if (k == -1 || S[j] == S[k])
        {
            // Either restarting or the prefix extends by one character.
            ++j;
            ++k;
            next[j] = k;
        }
        else
        {
            // Mismatch: fall back to the next shorter matching prefix.
            k = next[k];
        }
    }
}

// Reports whether pattern T occurs as a contiguous substring of every one of
// S[1] .. S[n-1], using KMP matching.
//
// S[0] is not searched: the caller in this file (chuli) cuts T out of S[0],
// so it matches there by construction.
//
// T : the candidate substring.
// S : array of n sequences.
// n : number of sequences in S.
//
// Returns true when T is found in all of S[1..n-1] (trivially true when
// n <= 1 or T is empty), false as soon as one sequence lacks it.
bool bijiao(const std::string& T, const std::string* S, int n)
{
    const int patLen = static_cast<int>(T.size());
    // The empty string is a substring of everything; returning here also
    // avoids building a zero-length failure table.
    if (patLen == 0)
        return true;

    // Heap-backed table instead of a variable-length array (a GNU extension).
    std::vector<int> next(patLen);
    getNext(T, &next[0]);

    for (int l = 1; l < n; ++l)
    {
        const std::string& text = S[l];
        const int textLen = static_cast<int>(text.size());
        int i = 0;  // index into text
        int j = 0;  // index into T
        bool found = false;
        while (i < textLen)
        {
            if (j == -1 || text[i] == T[j])
            {
                ++i;
                ++j;
            }
            else
            {
                j = next[j];  // slide the pattern using the failure table
            }
            if (j == patLen)  // whole pattern matched
            {
                found = true;
                break;
            }
        }
        if (!found)
            return false;  // one sequence without T settles the answer
    }
    return true;
}

// Prints the longest substring of aar[0] that bijiao() reports as present in
// the other sequences, followed by a newline.
//
// Lengths are tried from the full length of aar[0] down to 3. The first length
// that yields a match wins, and among matches of that length the
// lexicographically smallest is printed. If no substring of length >= 3
// qualifies (or n <= 0), "no significant commonalities" is printed instead.
//
// aar : array of n sequences.
// n   : number of sequences in aar.
void chuli(std::string* aar, int n)
{
    const int kMinLen = 3;  // shorter matches are not considered significant

    if (n <= 0)
    {
        std::cout << "no significant commonalities" << '\n';
        return;
    }

    const std::string& base = aar[0];
    // Use the actual length rather than assuming 60, so substr() can never be
    // asked for a start position beyond the end of the sequence.
    const int baseLen = static_cast<int>(base.size());

    for (int len = baseLen; len >= kMinLen; --len)
    {
        bool found = false;
        std::string best;
        for (int start = 0; start + len <= baseLen; ++start)
        {
            const std::string cand = base.substr(start, len);
            // A candidate that is not smaller than the current best cannot
            // change the answer, so skip the matching work for it.
            if (found && !(cand < best))
                continue;
            if (bijiao(cand, aar, n))
            {
                best = cand;
                found = true;
            }
        }
        if (found)
        {
            // Longer lengths were tried first, so this is the longest match.
            std::cout << best << '\n';
            return;
        }
    }
    std::cout << "no significant commonalities" << '\n';
}

// Reads the number of data sets, then for each set reads the sequence count
// and that many sequences, and hands them to chuli() to print the answer.
// Processing stops quietly on malformed input (a failed read or a
// non-positive sequence count).
int main()
{
    int N = 0;  // number of data sets
    if (!(std::cin >> N))
        return 0;

    for (int z = 0; z < N; ++z)
    {
        int n = 0;  // number of sequences in this data set
        if (!(std::cin >> n) || n <= 0)
            break;

        // std::vector instead of a variable-length array of std::string,
        // which is a compiler extension and not standard C++.
        std::vector<std::string> jjsz(n);
        for (int x = 0; x < n; ++x)
        {
            std::cin >> jjsz[x];
        }
        chuli(&jjsz[0], n);
    }
    return 0;
}

相關文章