使用该脚本对上述脚本 "lin_20240321_calculating_rG4score.R" 的输出结果进行过滤:对每个基因保留得分最高且互不重叠的序列。
python
import csv
def read_file(file_path):
    """Load a tab-separated file with a header row into a list of row dicts.

    Args:
        file_path: Path of the tab-separated file to read.

    Returns:
        A list with one dict per data row, keyed by the column names taken
        from the first line of the file. Field values are not converted.
    """
    # newline='' is what the csv module asks for when it is handed a file
    # object: the reader then sees the original line endings, so a quoted
    # field that contains a line break comes back unmodified instead of
    # being changed by newline translation.
    with open(file_path, 'r', newline='') as file:
        return list(csv.DictReader(file, delimiter='\t'))
def _overlaps_any(candidate, kept):
    """Return True if *candidate* overlaps at least one entry of *kept*.

    Two entries overlap when each one starts strictly before the other ends,
    so entries that merely touch (one 'End' equal to the other 'Start') are
    treated as non-overlapping.
    """
    # NOTE(review): with inclusive coordinates, touching entries would share
    # one position -- confirm the Start/End convention of the input table.
    return any(
        candidate['Start'] < other['End'] and candidate['End'] > other['Start']
        for other in kept
    )


def process_sequences(data):
    """Keep, for every gene, the highest-scoring mutually non-overlapping rows.

    Rows are grouped by their 'Id' value, ordered by score from high to low,
    and then accepted greedily: a row is kept only if it does not overlap a
    row already kept for the same gene.

    Args:
        data: Iterable of row mappings with the keys 'Id', 'Type', 'Start',
            'End', 'total_length', 'Sequence' and 'G4Hscore'. 'Start', 'End'
            and 'total_length' must be parseable by int(), 'G4Hscore' by
            float().

    Returns:
        A dict mapping each gene id to a list of dicts with the keys 'Type',
        'Start', 'End', 'Length', 'Sequence' and 'Score', ordered by
        descending score. Genes appear in order of first occurrence in *data*.

    Raises:
        KeyError: If a row lacks one of the required keys.
        ValueError: If a numeric field cannot be converted.
    """
    # Group the parsed rows by gene id.
    by_gene = {}
    for row in data:
        by_gene.setdefault(row['Id'], []).append({
            'Type': row['Type'],
            'Start': int(row['Start']),
            'End': int(row['End']),
            'Length': int(row['total_length']),
            'Sequence': row['Sequence'],
            'Score': float(row['G4Hscore']),
        })

    final_selection = {}
    for gene_id, candidates in by_gene.items():
        # Highest score first; the sort is stable, so rows with equal scores
        # keep their input order and the earlier one wins a conflict.
        candidates.sort(key=lambda entry: entry['Score'], reverse=True)

        kept = []
        for candidate in candidates:
            if not _overlaps_any(candidate, kept):
                kept.append(candidate)
        final_selection[gene_id] = kept
    return final_selection
def write_results(gene_sequences, output_file):
    """Write the selected sequences to a tab-separated file with a header row.

    Args:
        gene_sequences: Mapping of gene id to a list of dicts, each holding
            the keys 'Type', 'Start', 'End', 'Length', 'Sequence' and 'Score'.
        output_file: Path of the file to create; an existing file is
            overwritten.

    Returns:
        None. One line is written per sequence, preceded by the header
        Id, Type, Start, End, Total_length, Sequence, Score.
    """
    header = ['Id', 'Type', 'Start', 'End', 'Total_length', 'Sequence', 'Score']
    # Keys of each sequence dict, in the order of the columns that follow 'Id'.
    columns = ('Type', 'Start', 'End', 'Length', 'Sequence', 'Score')

    # NOTE: csv.writer's default dialect ends every row with '\r\n'; that
    # default is kept here, and newline='' stops it from being translated.
    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(header)
        writer.writerows(
            [gene_id, *(seq[column] for column in columns)]
            for gene_id, sequences in gene_sequences.items()
            for seq in sequences
        )
# Input and output file paths are taken from the command line.
# usage: python lin_filter_non-overlap_rg4.py -f1 lijinonextended_3utr_allrg4output1.fasta -f2 lijinonextended_3utr_allrg4output2.fasta
import argparse


def main():
    """Parse the command line, filter the input table and write the result.

    -f1/--file1 is the tab-separated input table and -f2/--file2 is the path
    the filtered table is written to. Both are required, so a missing option
    ends with an argparse usage error instead of a failure inside open().
    """
    parser = argparse.ArgumentParser(description="Advanced screening always by hash")
    parser.add_argument("-f1", "--file1", required=True, help="input1")
    parser.add_argument(
        "-f2", "--file2", required=True,
        help="output file for the filtered, non-overlapping sequences",
    )
    args = parser.parse_args()

    # Read the input file.
    data = read_file(args.file1)
    # Keep the highest-scoring, non-overlapping sequences of every gene.
    gene_sequences = process_sequences(data)
    # Write the result to the new file.
    write_results(gene_sequences, args.file2)


# Run only when executed as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    main()