多序列比对(Multiple Sequence Alignment,MSA)是对多个生物序列进行对齐的过程,以揭示它们之间的共同模式和结构。在生物信息学中,有多种文件格式用于存储多序列比对的结果,其中包括Stockholm (.sto) 和 A3M (.a3m) 格式。
注释信息: Stockholm文件通常包含更多的注释信息,提供关于序列和比对的额外信息。相比之下,A3M文件主要关注于序列本身。
应用领域: Stockholm文件广泛用于多种生物信息学工具,如HMMER。A3M文件通常用于蛋白质结构预测和深度学习模型的训练,如Alphafold。
from typing import Sequence, Iterable, Optional
def _convert_sto_seq_to_a3m(
query_non_gaps: Sequence[bool], sto_seq: str) -> Iterable[str]:
for is_query_res_non_gap, sequence_res in zip(query_non_gaps, sto_seq):
if is_query_res_non_gap:
yield sequence_res
elif sequence_res != '-':
yield sequence_res.lower()
def convert_stockholm_to_a3m(stockholm_format: str,
max_sequences: Optional[int] = None,
remove_first_row_gaps: bool = True) -> str:
"""Converts MSA in Stockholm format to the A3M format."""
descriptions = {}
sequences = {}
reached_max_sequences = False
for line in stockholm_format.splitlines():
reached_max_sequences = max_sequences and len(sequences) >= max_sequences
if line.strip() and not line.startswith(('#', '//')):
# Ignore blank lines, markup and end symbols - remainder are alignment
# sequence parts.
seqname, aligned_seq = line.split(maxsplit=1)
if seqname not in sequences:
if reached_max_sequences:
continue
sequences[seqname] = ''
sequences[seqname] += aligned_seq
for line in stockholm_format.splitlines():
if line[:4] == '#=GS':
# Description row - example format is:
# #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ...
columns = line.split(maxsplit=3)
seqname, feature = columns[1:3]
value = columns[3] if len(columns) == 4 else ''
if feature != 'DE':
continue
if reached_max_sequences and seqname not in sequences:
continue
descriptions[seqname] = value
if len(descriptions) == len(sequences):
break
# Convert sto format to a3m line by line
a3m_sequences = {}
if remove_first_row_gaps:
# query_sequence is assumed to be the first sequence
query_sequence = next(iter(sequences.values()))
query_non_gaps = [res != '-' for res in query_sequence]
for seqname, sto_sequence in sequences.items():
# Dots are optional in a3m format and are commonly removed.
out_sequence = sto_sequence.replace('.', '')
if remove_first_row_gaps:
out_sequence = ''.join(
_convert_sto_seq_to_a3m(query_non_gaps, out_sequence))
a3m_sequences[seqname] = out_sequence
fasta_chunks = (f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
for k in a3m_sequences)
return '\n'.join(fasta_chunks) + '\n' # Include terminating newline
with open("test_aln.sto") as f:
sto_str = f.read()
print(sto_str)
print("====")
a3m_str = convert_stockholm_to_a3m(sto_str)
print(a3m_str)