1.原始文件(GFF3 注释示例;注意:下面的脚本并不直接读取该 GFF3,而是读取由它整理出的、带表头 id、start、end 三列的制表符分隔文件)
bash
##gff-version 3
Chr1A IWGSC_v2.1 gene 40098 70338 33 - . ID=TraesCS1A03G0000200;previous_id=TraesCS1A02G000100;primconf=HC;Name=TraesCS1A03G0000200;cds=CDS_OK;mapping=fullMatchWithMissmatches
Chr1A IWGSC_v2.1 mRNA 40098 70338 . - . ID=TraesCS1A03G0000200.1;Parent=TraesCS1A03G0000200;Note=TraesCS1A01G000100;primconf=HC;Name=TraesCS1A03G0000200.1;secconf=HC2;cds=CDS_OK;mapping=fullMatchWithMissmatches;previous_id=TraesCS1A02G000100.1
Chr1A IWGSC_v2.1 three_prime_UTR 40098 40731 . - . ID=TraesCS1A03G0000200.1.utr3p1;Parent=TraesCS1A03G0000200.1
Chr1A IWGSC_v2.1 exon 40098 40731 . - . ID=TraesCS1A03G0000200.1.exon1;Parent=TraesCS1A03G0000200.1
Chr1A IWGSC_v2.1 three_prime_UTR 58474 58507 . - . ID=TraesCS1A03G0000200.1.utr3p2;Parent=TraesCS1A03G0000200.1
Chr1A IWGSC_v2.1 exon 58474 58897 . - . ID=TraesCS1A03G0000200.1.exon2;Parent=TraesCS1A03G0000200.1
Chr1A IWGSC_v2.1 CDS 58508 58768 . - 0 ID=TraesCS1A03G0000200.1.CDS1;Parent=TraesCS1A03G0000200.1
Chr1A IWGSC_v2.1 five_prime_UTR 58769 58897 . - . ID=TraesCS1A03G0000200.1.utr5p1;Parent=TraesCS1A03G0000200.1
Chr1A IWGSC_v2.1 exon 70089 70338 . - . ID=TraesCS1A03G0000200.1.exon3;Parent=TraesCS1A03G0000200.1
Chr1A IWGSC_v2.1 five_prime_UTR 70089 70338 . - . ID=TraesCS1A03G0000200.1.utr5p2;Parent=TraesCS1A03G0000200.1
Chr1A IWGSC_v2.1 gene 70239 89245 35 + . ID=TraesCS1A03G0000400;previous_id=TraesCS1A02G000200;primconf=HC;Name=TraesCS1A03G0000400;cds=CDS_OK;mapping=fullPerfectMatch
Chr1A IWGSC_v2.1 mRNA 70239 89245 . + . ID=TraesCS1A03G0000400.1;Parent=TraesCS1A03G0000400;Note=TraesCS1A01G000200;primconf=HC;Name=TraesCS1A03G0000400.1;secconf=HC2;cds=CDS_OK;mapping=fullPerfectMatch;previous_id=TraesCS1A02G000200.1
2.目的文件:
bash
TraesCS1A02G002100 bad
TraesCS1A02G002200 bad
TraesCS1A02G002900 bad
TraesCS1A02G003200 bad
TraesCS1A02G003300 bad
TraesCS1A02G003700 good
TraesCS1A02G003900 bad
TraesCS1A02G004100 bad
TraesCS1A02G004300 bad
TraesCS1A02G004700 bad
TraesCS1A02G004800 bad
TraesCS1A02G004900 good
TraesCS1A02G005700 good
3.代码:
python
# lin_whether_1st_intron_longest.py
#! /usr/bin/env python
# For each id, report whether its first intron is the longest one: print "good" if it is, otherwise "bad".
# usage: python lin_whether_1st_intron_longest.py -f1 Ta_genomeplus.gff5-1 > Ta_genomeplus.gff5-120824
# usage: python lin_whether_1st_intron_longest.py -f1 Ta_genomeminus.gff5-1 > Ta_genomeminus.gff5-120824
import pandas as pd
def outputlist1(f1):
    """Compute the gaps between consecutive rows of each id in a coordinate table.

    The input is a tab-delimited table with a header line. Its first column
    holds the id used for grouping, and it must provide ``start`` and ``end``
    columns. Rows sharing an id are treated as the exons of one transcript,
    taken in file order, and each gap between neighbouring rows is reported
    as an intron length: ``start(next) - end(previous) - 1``.

    Only ids with at least three rows (i.e. at least two gaps) are reported;
    ids with fewer rows are skipped.

    Parameters
    ----------
    f1 : str or file-like
        Path (or open text buffer) passed to ``pandas.read_table``.

    Returns
    -------
    list of [id, length]
        One two-element list per gap, with the length rendered as a string.
        Ids appear in sorted order; gaps of one id keep file order.

    Raises
    ------
    KeyError
        If the table has no ``start`` or ``end`` column.
    """
    table = pd.read_table(f1, index_col=0)

    gaps = []
    # Group on the index level directly and walk the groups; this avoids
    # positional ``series[i]`` lookups on a label-indexed Series and the
    # fragile round trip through comma-joined strings.
    for row_id, rows in table.groupby(level=0):
        if len(rows) < 3:
            # Fewer than three rows means fewer than two gaps: not reported.
            continue
        starts = rows["start"].tolist()
        ends = rows["end"].tolist()
        # Pair each row's end with the following row's start.
        for prev_end, next_start in zip(ends, starts[1:]):
            gaps.append([row_id, str(int(next_start) - int(prev_end) - 1)])
    return gaps
def outputlist2(f2):
    """Print, for every id, whether its first length is also its largest.

    Parameters
    ----------
    f2 : sequence of (id, length) pairs
        Anything ``pandas.DataFrame`` can turn into a two-column frame, such
        as the list returned by ``outputlist1``. Lengths may be ints or
        numeric strings.

    Output
    ------
    One line per id on stdout, ``<id>\\t<verdict>``, in sorted id order.
    The verdict is ``good`` when the first length listed for the id equals
    the maximum of its lengths (ties count as good), otherwise ``bad``.
    Nothing is printed for empty input.
    """
    frame = pd.DataFrame(f2)
    if frame.empty:
        # An empty frame has zero columns; naming two columns would raise.
        return
    frame.columns = ["id", "length"]

    # groupby sorts the ids and keeps the rows of each id in input order,
    # so the first element of every group is the first length supplied.
    for row_id, rows in frame.groupby("id"):
        lengths = [int(value) for value in rows["length"]]
        verdict = "good" if lengths[0] == max(lengths) else "bad"
        print(str(row_id) + "\t" + verdict)
def outputlist3(f2):
    """Print, for every id, whether its first length is also its largest.

    This is the variant the script calls for input files whose name does not
    contain "plus".

    NOTE(review): the original body was identical to ``outputlist2`` and
    that behaviour is kept here. If rows of these files are in ascending
    coordinate order for reverse-strand transcripts, the biologically first
    intron would be the *last* length, not the first -- confirm against how
    the input files are prepared.

    Parameters
    ----------
    f2 : sequence of (id, length) pairs
        Anything ``pandas.DataFrame`` can turn into a two-column frame.
        Lengths may be ints or numeric strings.

    Output
    ------
    One line per id on stdout, ``<id>\\t<verdict>``, in sorted id order.
    The verdict is ``good`` when the first length listed for the id equals
    the maximum of its lengths (ties count as good), otherwise ``bad``.
    Nothing is printed for empty input.
    """
    frame = pd.DataFrame(f2)
    if frame.empty:
        # An empty frame has zero columns; naming two columns would raise.
        return
    frame.columns = ["id", "length"]

    # groupby sorts the ids and keeps the rows of each id in input order.
    for row_id, rows in frame.groupby("id"):
        lengths = [int(value) for value in rows["length"]]
        if lengths[0] == max(lengths):
            verdict = "good"
        else:
            verdict = "bad"
        print(str(row_id) + "\t" + verdict)
import argparse
import os
def main():
    """Parse the command line and print the good/bad verdict for every id.

    The input table is read once by ``outputlist1``. The result goes to
    ``outputlist2`` when the given path contains "plus", otherwise to
    ``outputlist3``.
    """
    parser = argparse.ArgumentParser(description="Advanced screening always by hash")
    # The file is mandatory: without it there is nothing to read, so let
    # argparse report a clear usage error instead of failing later on None.
    parser.add_argument(
        "-f1",
        "--file1",
        required=True,
        help="the original file,tabulated,make sure do not contain blank line",
    )
    args = parser.parse_args()

    # pandas opens the path itself; no separate file handle is needed.
    intron_lengths = outputlist1(args.file1)
    if "plus" in args.file1:
        outputlist2(intron_lengths)
    else:
        outputlist3(intron_lengths)


if __name__ == "__main__":
    main()