使用 Python 脚本过滤,得到互不重叠(non-overlapping)的 UTR 序列

使用该脚本对上述 R 脚本 "lin_20240321_calculating_rG4score.R" 的输出结果进行过滤

python 复制代码
import csv

def read_file(file_path):
    """Read a tab-delimited file with a header row into a list of row dicts.

    Args:
        file_path: Path of the tab-separated input file. The first line is
            used as the column names.

    Returns:
        A list with one mapping per data row, keyed by column name. All
        values are strings. An empty file yields an empty list.
    """
    # newline='' is what the csv module asks for: it disables the text
    # layer's newline translation so that line breaks embedded in quoted
    # fields reach the parser unchanged.
    with open(file_path, 'r', newline='') as file:
        return list(csv.DictReader(file, delimiter='\t'))

def process_sequences(data):
    """Pick, per gene, the best-scoring records that do not overlap.

    Args:
        data: Iterable of row mappings with the keys 'Id', 'Type', 'Start',
            'End', 'total_length', 'Sequence' and 'G4Hscore'. 'Start',
            'End' and 'total_length' are converted with int(), 'G4Hscore'
            with float().

    Returns:
        A dict mapping each 'Id' (in first-seen order) to a list of record
        dicts with the keys 'Type', 'Start', 'End', 'Length', 'Sequence'
        and 'Score', ordered by descending score.
    """
    # Bucket the converted records by gene id.
    by_gene = {}
    for record in data:
        key = record['Id']
        begin = int(record['Start'])
        finish = int(record['End'])
        total = int(record['total_length'])
        g4_score = float(record['G4Hscore'])

        bucket = by_gene.setdefault(key, [])
        bucket.append({
            'Type': record['Type'],
            'Start': begin,
            'End': finish,
            'Length': total,
            'Sequence': record['Sequence'],
            'Score': g4_score,
        })

    # Greedy pass: walk candidates from highest to lowest score and keep
    # one only if it overlaps nothing kept so far for that gene.
    chosen = {}
    for key, candidates in by_gene.items():
        ranked = sorted(candidates, key=lambda item: item['Score'], reverse=True)
        kept = []
        for candidate in ranked:
            for earlier in kept:
                # Strict comparisons: intervals that only touch
                # (End == Start) are not counted as overlapping.
                if candidate['Start'] < earlier['End'] and candidate['End'] > earlier['Start']:
                    break
            else:
                kept.append(candidate)
        chosen[key] = kept

    return chosen

def write_results(gene_sequences, output_file):
    """Write the selected records to a tab-delimited file with a header.

    Args:
        gene_sequences: Mapping of gene id to a list of record dicts that
            carry the keys 'Type', 'Start', 'End', 'Length', 'Sequence'
            and 'Score'.
        output_file: Path of the file to create or overwrite.
    """
    header = ['Id', 'Type', 'Start', 'End', 'Total_length','Sequence', 'Score']
    # Record keys in the same order as the header columns after 'Id'.
    record_keys = ('Type', 'Start', 'End', 'Length', 'Sequence', 'Score')

    with open(output_file, 'w', newline='') as handle:
        tsv = csv.writer(handle, delimiter='\t')
        tsv.writerow(header)
        tsv.writerows(
            [gene_id] + [entry[key] for key in record_keys]
            for gene_id, entries in gene_sequences.items()
            for entry in entries
        )

# Input and output file paths are taken from the command line.
# usage: python lin_filter_non-overlap_rg4.py -f1 lijinonextended_3utr_allrg4output1.fasta -f2 lijinonextended_3utr_allrg4output2.fasta
import argparse


def main(argv=None):
    """Run the filter: read the input table, select records, write the output.

    Args:
        argv: Optional list of command-line arguments. When None, argparse
            falls back to sys.argv[1:].
    """
    parser = argparse.ArgumentParser(
        description="Keep the highest-scoring non-overlapping sequences for each Id"
    )
    # Both paths are mandatory; without them the script would otherwise
    # fail later with an unhelpful TypeError from open(None).
    parser.add_argument(
        "-f1", "--file1", required=True,
        help="input tab-delimited file (columns: Id, Type, Start, End, "
             "total_length, Sequence, G4Hscore)",
    )
    parser.add_argument(
        "-f2", "--file2", required=True,
        help="output tab-delimited file to write",
    )
    args = parser.parse_args(argv)

    # Read the input file
    data = read_file(args.file1)
    # Keep the highest-scoring, non-overlapping sequences
    gene_sequences = process_sequences(data)
    # Write the result to the output file
    write_results(gene_sequences, args.file2)


# Only run when executed as a script, so the functions above can be imported
# without triggering argument parsing or file I/O.
if __name__ == "__main__":
    main()
相关推荐
少林码僧4 小时前
2.31 机器学习神器项目实战:如何在真实项目中应用XGBoost等算法
人工智能·python·算法·机器学习·ai·数据挖掘
智航GIS4 小时前
10.4 Selenium:Web 自动化测试框架
前端·python·selenium·测试工具
jarreyer4 小时前
摄像头相关记录
python
宝贝儿好4 小时前
【强化学习】第六章:无模型控制:在轨MC控制、在轨时序差分学习(Sarsa)、离轨学习(Q-learning)
人工智能·python·深度学习·学习·机器学习·机器人
大、男人5 小时前
python之asynccontextmanager学习
开发语言·python·学习
默默前行的虫虫5 小时前
nicegui文件上传归纳
python
一个没有本领的人6 小时前
UIU-Net运行记录
python
国强_dev6 小时前
Python 的“非直接原因”报错
开发语言·python
副露のmagic6 小时前
更弱智的算法学习 day24
python·学习·算法
廖圣平6 小时前
从零开始,福袋直播间脚本研究【三】《多进程执行selenium》
python·selenium·测试工具