使用 Python 脚本过滤，得到互不重叠（non-overlap）的 UTR 序列

使用该脚本对上述 R 脚本 "lin_20240321_calculating_rG4score.R" 的输出结果进行过滤。

Python 代码如下：
import csv

def read_file(file_path):
    """Load a tab-separated file with a header row into a list of row dicts.

    Args:
        file_path: Path of the tab-separated file to read.

    Returns:
        A list with one mapping per data row, keyed by the names found in
        the header line, exactly as produced by ``csv.DictReader``.
    """
    # newline='' hands newline handling to the csv module, as its
    # documentation requires; without it, line endings embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(file_path, 'r', newline='') as file:
        reader = csv.DictReader(file, delimiter='\t')
        return list(reader)

def process_sequences(data):
    """Pick, per gene, the best-scoring sequences that do not overlap.

    Args:
        data: Iterable of row mappings providing the keys 'Id', 'Type',
            'Start', 'End', 'total_length', 'Sequence' and 'G4Hscore'.
            'Start', 'End' and 'total_length' are converted with ``int``
            and 'G4Hscore' with ``float``.

    Returns:
        A dict mapping each 'Id' (in order of first appearance) to a list
        of dicts with keys 'Type', 'Start', 'End', 'Length', 'Sequence'
        and 'Score', ordered by descending score.
    """
    grouped = {}
    for record in data:
        key = record['Id']
        begin = int(record['Start'])
        stop = int(record['End'])
        total = int(record['total_length'])
        g4_score = float(record['G4Hscore'])

        grouped.setdefault(key, []).append({
            'Type': record['Type'],
            'Start': begin,
            'End': stop,
            'Length': total,
            'Sequence': record['Sequence'],
            'Score': g4_score,
        })

    selected = {}
    for key, candidates in grouped.items():
        # Visit candidates from highest to lowest score; the sort is stable,
        # so equal scores keep their input order.
        ranked = sorted(candidates, key=lambda item: item['Score'], reverse=True)

        kept = []
        for candidate in ranked:
            # Greedy choice: accept a candidate only if it overlaps none of
            # the already accepted ones. The comparisons are strict, so
            # intervals that merely touch are not treated as overlapping.
            for accepted in kept:
                if candidate['Start'] < accepted['End'] and candidate['End'] > accepted['Start']:
                    break
            else:
                kept.append(candidate)
        selected[key] = kept

    return selected

def write_results(gene_sequences, output_file):
    """Write the selected sequences to a tab-separated file with a header.

    Args:
        gene_sequences: Mapping of gene id to a list of sequence dicts, each
            providing 'Type', 'Start', 'End', 'Length', 'Sequence' and
            'Score'.
        output_file: Path of the file to write; it is opened in 'w' mode.
    """
    header = ['Id', 'Type', 'Start', 'End', 'Total_length','Sequence', 'Score']
    # Keys of each sequence dict, in the column order that follows 'Id'.
    columns = ('Type', 'Start', 'End', 'Length', 'Sequence', 'Score')

    with open(output_file, 'w', newline='') as handle:
        tsv = csv.writer(handle, delimiter='\t')
        tsv.writerow(header)
        tsv.writerows(
            [gene, *(entry[column] for column in columns)]
            for gene, entries in gene_sequences.items()
            for entry in entries
        )

# Command-line entry point: input and output file paths come from the options.
# usage: python lin_filter_non-overlap_rg4.py -f1 lijinonextended_3utr_allrg4output1.fasta -f2 lijinonextended_3utr_allrg4output2.fasta
import argparse


def parse_args(argv=None):
    """Parse the command-line options.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The parsed namespace with attributes ``file1`` (input path) and
        ``file2`` (output path).

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            an option is invalid.
    """
    parser = argparse.ArgumentParser(
        description="Keep the highest-scoring non-overlapping sequences per Id")
    # Both paths are mandatory; without required=True a missing option
    # would surface later as an opaque TypeError from open(None).
    parser.add_argument("-f1", "--file1", required=True,
                        help="input tab-separated file")
    parser.add_argument("-f2", "--file2", required=True,
                        help="output tab-separated file")
    return parser.parse_args(argv)


def main(argv=None):
    """Read the input file, filter the sequences and write the result.

    Args:
        argv: Optional argument list forwarded to ``parse_args``.
    """
    args = parse_args(argv)

    # Read the input file.
    data = read_file(args.file1)
    # Keep the highest-scoring, non-overlapping sequences.
    gene_sequences = process_sequences(data)
    # Write the result to the output file.
    write_results(gene_sequences, args.file2)


# Guarded so that importing this module does not parse sys.argv or touch files.
if __name__ == "__main__":
    main()
相关推荐
学测绘的小杨6 小时前
CompassFusion:一个从 GNSS 到 GNSS/INS 组合导航的独立工程包
python
zzzzzz31012 小时前
当产品经理说这个很简单:我用Python自动化处理奇葩需求的实战指南
python·pycharm·产品经理
雪隐13 小时前
个人电脑玩AI-06让5060 Ti给你打工——不光能画画,Qwen3-TTS还能学人说话,连我老板都信了!
人工智能·后端·python
兵慌码乱1 天前
面向桌面端的资产管理系统分层架构设计与核心模块实现
python·系统架构·sqlite·pyqt5·数据库设计·桌面应用开发·mvc架构
hboot1 天前
AI工程师第三课 - 机器学习基础
python·scikit-learn·kaggle
顾林海1 天前
Agent入门阶段-编程基础-Python:流程控制
python·agent·ai编程
呱呱复呱呱1 天前
Django CBV 源码解读:一个请求是怎么找到你的 get() 方法的
python·django
曲幽2 天前
刚部署的 LibreTranslate 频频翻车?我掏出了 20 年前的 StarDict 词典,用 FastAPI 搭了个本地词典翻译 API
python·fastapi·web·translate·goldendict·libretranslate·stardict·pystardict
荣码2 天前
用Streamlit给AI应用套个界面,10行代码出Web页面
java·python
兵慌码乱2 天前
基于Python+PyQt5+SQLite的药房管理系统实现:事务一致性与界面解耦全流程解析
python·sqlite·信号与槽·pyqt5·数据库设计·桌面应用开发·事务处理