如何根据过滤的pep序列进一步过滤gff3文件--python015

如何根据过滤的pep序列进一步过滤gff3文件？得到最长转录本的gff3过滤结果？

如何根据过滤的pep序列进一步过滤cds序列--python014-CSDN博客

复制代码

filter_gff3_by_pep.py

#!/usr/bin/env python3
"""
根据最长转录本pep文件的ID，筛选原始GFF3文件，保留对应注释（完整层级）
输入：
1. 最长转录本pep文件：Chr_genome_longest_isoform.pep
2. 原始GFF3文件：Chr_genome_all_transcripts_final_gene.gff
输出：
仅含最长转录本的GFF3文件：Chr_genome_longest_isoform.gff3
特性：
- 保留原始GFF3格式（注释行、空行、列顺序、分隔符等）
- 保留完整的注释层级（gene → mRNA → exon/CDS等）
- 容错处理（支持GFF3常见格式变体）
"""
import sys
import os
import re
from collections import defaultdict

def read_pep_transcript_ids(pep_file):
    """
    Collect the transcript IDs named in the FASTA headers of a pep file.

    A header is any line that starts with ``>`` once surrounding whitespace
    is stripped. The ID is the first whitespace-delimited token after the
    ``>`` (``>Pch01G000010.1 gene=xxx`` yields ``Pch01G000010.1``).
    Headers with nothing after the ``>`` are reported on stderr and skipped.

    Args:
        pep_file: Path to the protein FASTA file; it is read as UTF-8.

    Returns:
        A set of transcript ID strings.

    Exits the process with status 1, after printing a message to stderr,
    when the file does not exist or no ID could be extracted from it.
    """
    if not os.path.exists(pep_file):
        print(f"错误：pep文件 {pep_file} 不存在！", file=sys.stderr)
        sys.exit(1)

    transcript_ids = set()
    with open(pep_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line.startswith('>'):
                continue
            header_tokens = line[1:].split()
            if not header_tokens:
                # A bare ">" has no ID; indexing [0] here would raise IndexError.
                print(f"警告：pep文件第{line_num}行的序列头为空，已跳过", file=sys.stderr)
                continue
            transcript_ids.add(header_tokens[0])

    if not transcript_ids:
        print("错误：从pep文件中未提取到任何转录本ID！", file=sys.stderr)
        sys.exit(1)
    print(f"成功从pep文件提取 {len(transcript_ids)} 个最长转录本ID")
    return transcript_ids

def parse_gff3_relationships(gff3_file):
    """
    Read a GFF3 file and index its features by ID and Parent.

    Blank lines, comment lines (starting with ``#``) and lines without an
    ``ID`` attribute are stored in ``all_lines`` but not indexed. Lines with
    fewer than 9 tab-separated columns are additionally reported on stderr.

    Args:
        gff3_file: Path to the GFF3 file; it is read as UTF-8.

    Returns:
        A 5-tuple ``(transcript2gene, gene2transcripts, child2parent,
        id2line, all_lines)``:

        - transcript2gene: dict, ID of each ``mRNA``/``transcript`` feature
          -> its parent ID.
        - gene2transcripts: defaultdict(list), parent ID -> IDs of its
          ``mRNA``/``transcript`` features, in file order.
        - child2parent: dict, feature ID -> parent ID. When ``Parent`` lists
          several comma-separated IDs only the first one is recorded.
        - id2line: dict, feature ID -> original line (newline included).
          When several lines share an ID the last one wins.
        - all_lines: list of every line of the file, unmodified, in order.

    Exits the process with status 1, after printing a message to stderr,
    when the file does not exist.
    """
    transcript2gene = {}
    gene2transcripts = defaultdict(list)
    child2parent = {}
    id2line = {}
    all_lines = []
    # Anchor on the start of the column or on ';' so that keys which merely
    # end in "ID" or "Parent" (e.g. geneID=...) are not taken for the real
    # ID / Parent attribute.
    id_pattern = re.compile(r'(?:^|;)\s*ID=([^;]+)')
    parent_pattern = re.compile(r'(?:^|;)\s*Parent=([^;]+)')

    if not os.path.exists(gff3_file):
        print(f"错误：GFF3文件 {gff3_file} 不存在！", file=sys.stderr)
        sys.exit(1)

    with open(gff3_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            # Keep the untouched line (newline included) so the output can
            # reproduce the input formatting.
            all_lines.append(line)
            line_strip = line.strip()

            # Comment and blank lines carry no feature.
            if not line_strip or line_strip.startswith('#'):
                continue

            cols = line_strip.split('\t')
            if len(cols) < 9:
                print(f"警告：第{line_num}行不符合GFF3格式（列数<9），已跳过", file=sys.stderr)
                continue

            feature_type = cols[2]
            attributes = cols[8]

            id_match = id_pattern.search(attributes)
            if not id_match:
                # Without an ID the feature cannot be a key in any mapping.
                continue
            feature_id = id_match.group(1)
            id2line[feature_id] = line

            parent_match = parent_pattern.search(attributes)
            if not parent_match:
                continue

            # Parent may be a comma-separated list; the mappings hold a
            # single parent per feature, so only the first is recorded.
            parent_id = parent_match.group(1).split(',')[0]
            child2parent[feature_id] = parent_id

            # Both spellings of the transcript feature type are accepted.
            if feature_type in ('mRNA', 'transcript'):
                transcript2gene[feature_id] = parent_id
                gene2transcripts[parent_id].append(feature_id)

    print(f"成功解析GFF3文件，共识别 {len(gene2transcripts)} 个基因，{len(transcript2gene)} 个转录本")
    return transcript2gene, gene2transcripts, child2parent, id2line, all_lines

# Column-9 attribute patterns, anchored on the start of the column or on ';'
# so that keys which merely end in "ID" or "Parent" (e.g. geneID=...) are not
# taken for the real ID / Parent attribute.
_ATTR_ID_RE = re.compile(r'(?:^|;)\s*ID=([^;]+)')
_ATTR_PARENT_RE = re.compile(r'(?:^|;)\s*Parent=([^;]+)')


def _feature_id_and_parents(line):
    """
    Return ``(feature_id, parent_ids)`` for a GFF3 feature line.

    ``feature_id`` is None when the line has no ID attribute and
    ``parent_ids`` is an empty list when it has no Parent attribute.
    Returns None for blank lines, comment lines and lines with fewer than
    9 tab-separated columns.
    """
    stripped = line.strip()
    if not stripped or stripped.startswith('#'):
        return None
    cols = stripped.split('\t')
    if len(cols) < 9:
        return None
    attributes = cols[8]
    id_match = _ATTR_ID_RE.search(attributes)
    parent_match = _ATTR_PARENT_RE.search(attributes)
    feature_id = id_match.group(1) if id_match else None
    parent_ids = parent_match.group(1).split(',') if parent_match else []
    return feature_id, parent_ids


def filter_gff3(transcript_ids, transcript2gene, gene2transcripts, child2parent, id2line, all_lines, output_file):
    """
    Write the subset of a GFF3 file that belongs to the selected transcripts.

    A feature line is written when its ID is one of:
      - a gene that ``transcript2gene`` maps a selected transcript to,
      - a selected transcript,
      - a direct or indirect child of a selected transcript,
    or when it has no ID and one of its Parent IDs is a selected transcript
    or one of that transcript's descendants. Every comma-separated Parent
    is honoured, so a child shared by several transcripts is retained as
    long as any of them is selected. Retained lines are written unchanged,
    which means such a shared child still lists its dropped parents.

    Blank and comment lines are copied through; lines with fewer than 9
    tab-separated columns are dropped.

    Args:
        transcript_ids: Set of transcript IDs to retain.
        transcript2gene: dict, transcript ID -> gene ID.
        gene2transcripts: Unused; accepted for call compatibility.
        child2parent: dict, feature ID -> parent ID. Merged with the
            Parent attributes found in ``all_lines``.
        id2line: Unused; accepted for call compatibility.
        all_lines: Every line of the source GFF3 file, in order, with
            line endings.
        output_file: Path of the GFF3 file to write (UTF-8).
    """
    # Step 1: genes owning at least one selected transcript.
    keep_gene_ids = {
        transcript2gene[tid] for tid in transcript_ids if tid in transcript2gene
    }
    print(f"需要保留的基因数量：{len(keep_gene_ids)}")

    # Parse each line once; the result is reused for the graph and the output.
    parsed_lines = [_feature_id_and_parents(line) for line in all_lines]

    # Step 2: parent -> children, covering every listed parent.
    parent2children = defaultdict(set)
    for child_id, parent_id in child2parent.items():
        parent2children[parent_id].add(child_id)
    for info in parsed_lines:
        if info is None:
            continue
        feature_id, parent_ids = info
        if feature_id is None:
            continue
        for parent_id in parent_ids:
            parent2children[parent_id].add(feature_id)

    keep_feature_ids = set(keep_gene_ids)
    keep_feature_ids.update(transcript_ids)
    # Selected transcripts plus their descendants. Genes are left out on
    # purpose: an ID-less line may only be pulled in through this set, and
    # a gene must not pull in its unselected transcripts.
    subtree_ids = set(transcript_ids)

    # Only set membership matters, so the visit order is irrelevant and a
    # stack (O(1) pop) replaces the front-popped list.
    pending = list(transcript_ids)
    while pending:
        current_id = pending.pop()
        for child in parent2children.get(current_id, ()):
            if child not in keep_feature_ids:
                keep_feature_ids.add(child)
                subtree_ids.add(child)
                pending.append(child)

    print(f"需要保留的总特征数量：{len(keep_feature_ids)}")

    # Step 3: emit the retained lines exactly as they were read.
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for line, info in zip(all_lines, parsed_lines):
            if info is None:
                stripped = line.strip()
                if not stripped or stripped.startswith('#'):
                    f_out.write(line)
                # Anything else here is a malformed feature line: dropped.
                continue

            feature_id, parent_ids = info
            if feature_id is not None:
                keep = feature_id in keep_feature_ids
            else:
                # No ID to look up: keep the line if it hangs off a
                # retained transcript or one of its descendants.
                keep = any(parent_id in subtree_ids for parent_id in parent_ids)
            if keep:
                f_out.write(line)

    print(f"筛选完成！最长转录本GFF3已保存至：{output_file}")

def main():
    """
    Command-line entry point.

    Expects exactly three arguments: the pep file, the source GFF3 file and
    the output GFF3 path. Prints usage to stderr and exits with status 1
    otherwise. After filtering, prints the first five lines of the output
    file whose third column contains ``mRNA`` as a quick sanity check.
    """
    if len(sys.argv) != 4:
        print("用法：python3 filter_gff3_by_pep.py <pep文件> <原始GFF3文件> <输出GFF3文件>", file=sys.stderr)
        print("示例：", file=sys.stderr)
        print("python3 filter_gff3_by_pep.py Chr_genome_longest_isoform.pep Chr_genome_all_transcripts_final_gene.gff Chr_genome_longest_isoform.gff3", file=sys.stderr)
        sys.exit(1)

    pep_file, gff3_file, output_file = sys.argv[1:4]

    # Step 1: transcript IDs named in the pep file.
    transcript_ids = read_pep_transcript_ids(pep_file)

    # Step 2: ID / Parent relationships of the source GFF3.
    transcript2gene, gene2transcripts, child2parent, id2line, all_lines = parse_gff3_relationships(gff3_file)

    # Step 3: write the filtered GFF3.
    filter_gff3(transcript_ids, transcript2gene, gene2transcripts, child2parent, id2line, all_lines, output_file)

    # Sanity check: show the first five mRNA lines of the result.
    print("\n筛选后的GFF3文件中前5个mRNA特征：")
    count = 0
    with open(output_file, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # Test the stripped text so an indented comment line is skipped
            # instead of being indexed as if it were a feature line.
            if not stripped or stripped.startswith('#'):
                continue
            cols = line.split('\t')
            # Guard the column index; a short line has no feature type.
            if len(cols) < 3 or 'mRNA' not in cols[2]:
                continue
            print(stripped)
            count += 1
            if count >= 5:
                break

# Module-level attribute patterns, compiled once.
# Each is anchored on the start of the attribute column or on ';' so that a
# key which merely ends in "ID" or "Parent" (e.g. geneID=...) is not taken
# for the real ID / Parent attribute. Group 1 is the raw attribute value.
id_pattern = re.compile(r'(?:^|;)\s*ID=([^;]+)')
parent_pattern = re.compile(r'(?:^|;)\s*Parent=([^;]+)')

if __name__ == "__main__":
    main()

使用及结果

bash 复制代码

# 执行筛选
python3 filter_gff3_by_pep.py Chr_genome_longest_isoform.pep Chr_genome_all_transcripts_final_gene.gff Chr_genome_longest_isoform.gff3