使用 Python 脚本过滤,得到互不重叠(non-overlap)的 UTR 序列

使用该脚本对上一步 "lin_20240321_calculating_rG4score.R" 的输出结果进行过滤:对每个基因(Id),只保留得分最高且互不重叠的序列。

python 复制代码
import csv

def read_file(file_path):
    """Load a tab-separated table with a header row into a list of row dicts.

    Args:
        file_path: Path of the tab-delimited text file to read.

    Returns:
        A list with one dict per data row, keyed by the column names taken
        from the first line of the file. Field values are the raw strings
        produced by ``csv.DictReader``; no type conversion happens here.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires. Without it, universal-newline translation
    # rewrites a "\r\n" embedded in a quoted field to "\n".
    with open(file_path, 'r', newline='') as file:
        reader = csv.DictReader(file, delimiter='\t')
        return list(reader)

def process_sequences(data):
    """Pick, per gene, the highest-scoring sequences that do not overlap.

    Args:
        data: Iterable of row mappings providing the keys 'Id', 'Type',
            'Start', 'End', 'total_length', 'Sequence' and 'G4Hscore'.
            'Start', 'End' and 'total_length' are parsed with ``int`` and
            'G4Hscore' with ``float``.

    Returns:
        A dict mapping each 'Id' (in order of first appearance) to a list of
        dicts with the keys 'Type', 'Start', 'End', 'Length', 'Sequence' and
        'Score'. Each list is ordered by descending score and contains no two
        entries whose intervals overlap.
    """
    grouped = {}
    for record in data:
        key = record['Id']
        begin = int(record['Start'])
        stop = int(record['End'])
        total = int(record['total_length'])
        g4h = float(record['G4Hscore'])

        grouped.setdefault(key, []).append({
            'Type': record['Type'],
            'Start': begin,
            'End': stop,
            'Length': total,
            'Sequence': record['Sequence'],
            'Score': g4h,
        })

    chosen = {}
    for key, hits in grouped.items():
        # Rank this gene's candidates from highest to lowest score; the sort
        # is stable, so equal scores keep their input order.
        hits.sort(key=lambda hit: hit['Score'], reverse=True)

        # Greedy pass: accept a candidate only if it does not overlap any
        # higher-ranked candidate that was already accepted.
        kept = []
        for hit in hits:
            for prior in kept:
                if hit['Start'] < prior['End'] and hit['End'] > prior['Start']:
                    break
            else:
                kept.append(hit)
        chosen[key] = kept

    return chosen

def write_results(gene_sequences, output_file):
    """Write the selected sequences to a tab-separated file with a header.

    Args:
        gene_sequences: Mapping of gene id to a list of dicts carrying the
            keys 'Type', 'Start', 'End', 'Length', 'Sequence' and 'Score'.
        output_file: Path of the file to create or overwrite.
    """
    header = ['Id', 'Type', 'Start', 'End', 'Total_length','Sequence', 'Score']
    # Per-sequence keys, in the column order that follows the leading Id.
    fields = ('Type', 'Start', 'End', 'Length', 'Sequence', 'Score')

    with open(output_file, 'w', newline='') as handle:
        tsv = csv.writer(handle, delimiter='\t')
        tsv.writerow(header)
        tsv.writerows(
            [gene_id, *(hit[name] for name in fields)]
            for gene_id, hits in gene_sequences.items()
            for hit in hits
        )

# Input and output file paths
# usage: python lin_filter_non-overlap_rg4.py -f1 lijinonextended_3utr_allrg4output1.fasta -f2 lijinonextended_3utr_allrg4output2.fasta
import argparse


def main():
    """Command-line entry point: read, filter and write the sequence table.

    Parses ``-f1/--file1`` (input path) and ``-f2/--file2`` (output path),
    loads the input with ``read_file``, keeps the highest-scoring
    non-overlapping sequences with ``process_sequences`` and writes them with
    ``write_results``.
    """
    parser = argparse.ArgumentParser(description="Advanced screening always by hash")
    # Both paths are mandatory: without them the script would previously fail
    # later with an opaque TypeError from open(None).
    parser.add_argument("-f1", "--file1", required=True,
                        help="input tab-separated file to filter")
    parser.add_argument("-f2", "--file2", required=True,
                        help="output file for the filtered result")
    args = parser.parse_args()

    # Read the input file
    data = read_file(args.file1)
    # Keep the highest-scoring, non-overlapping sequences
    gene_sequences = process_sequences(data)
    # Write the result to the new file
    write_results(gene_sequences, args.file2)


# Run only when executed as a script, so importing the module has no I/O
# side effects.
if __name__ == "__main__":
    main()
相关推荐
岑梓铭31 分钟前
(CentOs系统虚拟机)Standalone模式下安装部署“基于Python编写”的Spark框架
linux·python·spark·centos
游客5201 小时前
opencv中的各种滤波器简介
图像处理·人工智能·python·opencv·计算机视觉
Eric.Lee20211 小时前
moviepy将图片序列制作成视频并加载字幕 - python 实现
开发语言·python·音视频·moviepy·字幕视频合成·图像制作为视频
Dontla1 小时前
vscode怎么设置anaconda python解释器(anaconda解释器、vscode解释器)
ide·vscode·python
qq_529025292 小时前
Torch.gather
python·深度学习·机器学习
数据小爬虫@2 小时前
如何高效利用Python爬虫按关键字搜索苏宁商品
开发语言·爬虫·python
Cachel wood2 小时前
python round四舍五入和decimal库精确四舍五入
java·linux·前端·数据库·vue.js·python·前端框架
終不似少年遊*2 小时前
pyecharts
python·信息可视化·数据分析·学习笔记·pyecharts·使用技巧
Python之栈2 小时前
【无标题】
数据库·python·mysql
袁袁袁袁满2 小时前
100天精通Python(爬虫篇)——第113天:‌爬虫基础模块之urllib详细教程大全
开发语言·爬虫·python·网络爬虫·爬虫实战·urllib·urllib模块教程