環境準備及背景介紹
目錄結構:
2
Python 實現
BaimoTools.py
1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3# @Author : Baimoc 4# @Email : baimoc@163.com 5# @Time : 2021/3/17 14:28 6# @File : BaimoTools 7import os 8import time 9 10from Bio import SeqIO, SeqFeature 11 12 13class BaimoTools(): 14 def __init__(self, gb_file, fasta_file): 15 self.complete_fasta = "" 16 self.fasta_file = fasta_file 17 self.gb_file = gb_file 18 self.feature = None 19 self.record = None 20 21 def format_val(self, object=None): 22 """ 23 格式化物件值為字串 24 :param object: 物件或物件鍵值 25 :return: 26 """ 27 key = "" 28 # 判斷引數是否為字元 29 if isinstance(object, str): 30 obj = self.feature.qualifiers 31 key = object 32 else: 33 obj = object 34 # 為字元,提取 feature.qualifiers 物件關鍵字 35 if key != "" and not obj.get(key): 36 return "" 37 elif key == "": 38 val = obj 39 else: 40 val = obj[key] 41 # 轉換為字串 42 if not len(val): 43 val = "" 44 elif len(val) == 1: 45 val = val[0] 46 else: 47 if isinstance(val, SeqFeature.CompoundLocation) or isinstance(val, SeqFeature.FeatureLocation): 48 val = str(val) 49 else: 50 val = " | ".join(val) 51 return val 52 53 def extract_cds(self, cds): 54 """ 55 獲取 CDS 的 Fasta 序列 56 :param cds: 獲取指定基因的 CDS 區域,如果為空,則獲取全部 57 """ 58 records = list(SeqIO.parse(self.gb_file, "genbank")) 59 for record in records: 60 print(f"{record.id}") 61 for feature in record.features: 62 # 提取 CDS 資訊 63 try: 64 if feature.type == "CDS": 65 self.feature = feature 66 self.record = record 67 cds_gene = self.format_val('gene') 68 if cds == "": 69 self.complete_fasta += self.format_fasta() 70 elif isinstance(cds, str) and cds_gene == cds: 71 self.complete_fasta += self.format_fasta() 72 elif isinstance(cds, list) and cds_gene in cds: 73 self.complete_fasta += self.format_fasta() 74 except: 75 print(f"{record.id}: No CDS features") 76 self.write_file() 77 78 def write_file(self): 79 """ 80 寫入檔案 81 """ 82 with open(self.fasta_file, "w") as f_obj: 83 f_obj.writelines(self.complete_fasta) 84 85 def format_fasta(self, num=0): 86 """ 87 整理 Fasta 格式 88 :param num: 每行字元數,超出則換行 89 :return: Fasta 文字 90 """ 91 cds_gene = self.format_val('gene') 92 cds_location = self.format_val(self.feature.location) 93 cds_product = self.format_val('product') 94 cds_protein_id = self.format_val('protein_id') 95 cds_translation = self.format_val('translation') 96 complete_ana = f">{self.record.id} | {cds_gene} | {cds_product} | {cds_protein_id} | {str(cds_location)}\n" 97 format_seq = "" 98 if num: 99 for i, char in enumerate(cds_translation):100 format_seq += char101 if (i + 1) % num == 0:102 format_seq += "\n"103 else:104 format_seq = cds_translation105 return complete_ana + format_seq + "\n"
3
使用示例
1
資料介紹
示例資料為新冠病毒的基因組 genbank 檔案,檔案中包含:
兩個基因組:LC553263.1 和 LC553262.1
一個基因組會有多個基因,下面是它的基因組結構:
2
提取單個基因CDS
main.py
1from BaimoTools import BaimoTools23gb_file = f"res/genbank/SARS-CoV-2.gb"4fasta_file = f"out/output_s.fasta"5baimoTools = BaimoTools(gb_file, fasta_file)6# baimoTools.extract_cds('S')
輸出檔案 output_s.fasta
,分別提取到兩個基因組的 S 基因 CDS 區域:
3
提取多個基因CDS
main.py
1from BaimoTools import BaimoTools23gb_file = f"res/genbank/SARS-CoV-2.gb"4fasta_file = f"out/output_s_m_orf10.fasta"5baimoTools = BaimoTools(gb_file, fasta_file)6baimoTools.extract_cds(['S', 'M', 'ORF10'])
輸出檔案 output_s_m_orf10.fasta
,分別提取到兩個基因組的 S,M,ORF10 基因 CDS 區域::
4
提取全部基因CDS
main.py
1from BaimoTools import BaimoTools23gb_file = f"res/genbank/SARS-CoV-2.gb"4fasta_file = f"out/output_s.fasta"5baimoTools = BaimoTools(gb_file, fasta_file)6# baimoTools.extract_cds("")
輸出檔案 output_all.fasta
,分別提取到兩個基因組的全部基因 CDS 區域:
下一步更新其他基因特徵提取,及格式轉換功能。
本作品採用《CC 協議》,轉載必須註明作者和本文連結