python之函数返回数据框

1.原始文件

bash 复制代码
##gff-version 3
Chr1A   IWGSC_v2.1      gene    40098   70338   33      -       .       ID=TraesCS1A03G0000200;previous_id=TraesCS1A02G000100;primconf=HC;Name=TraesCS1A03G0000200;cds=CDS_OK;mapping=fullMatchWithMissmatches
Chr1A   IWGSC_v2.1      mRNA    40098   70338   .       -       .       ID=TraesCS1A03G0000200.1;Parent=TraesCS1A03G0000200;Note=TraesCS1A01G000100;primconf=HC;Name=TraesCS1A03G0000200.1;secconf=HC2;cds=CDS_OK;mapping=fullMatchWithMissmatches;previous_id=TraesCS1A02G000100.1
Chr1A   IWGSC_v2.1      three_prime_UTR 40098   40731   .       -       .       ID=TraesCS1A03G0000200.1.utr3p1;Parent=TraesCS1A03G0000200.1
Chr1A   IWGSC_v2.1      exon    40098   40731   .       -       .       ID=TraesCS1A03G0000200.1.exon1;Parent=TraesCS1A03G0000200.1
Chr1A   IWGSC_v2.1      three_prime_UTR 58474   58507   .       -       .       ID=TraesCS1A03G0000200.1.utr3p2;Parent=TraesCS1A03G0000200.1
Chr1A   IWGSC_v2.1      exon    58474   58897   .       -       .       ID=TraesCS1A03G0000200.1.exon2;Parent=TraesCS1A03G0000200.1
Chr1A   IWGSC_v2.1      CDS     58508   58768   .       -       0       ID=TraesCS1A03G0000200.1.CDS1;Parent=TraesCS1A03G0000200.1
Chr1A   IWGSC_v2.1      five_prime_UTR  58769   58897   .       -       .       ID=TraesCS1A03G0000200.1.utr5p1;Parent=TraesCS1A03G0000200.1
Chr1A   IWGSC_v2.1      exon    70089   70338   .       -       .       ID=TraesCS1A03G0000200.1.exon3;Parent=TraesCS1A03G0000200.1
Chr1A   IWGSC_v2.1      five_prime_UTR  70089   70338   .       -       .       ID=TraesCS1A03G0000200.1.utr5p2;Parent=TraesCS1A03G0000200.1
Chr1A   IWGSC_v2.1      gene    70239   89245   35      +       .       ID=TraesCS1A03G0000400;previous_id=TraesCS1A02G000200;primconf=HC;Name=TraesCS1A03G0000400;cds=CDS_OK;mapping=fullPerfectMatch
Chr1A   IWGSC_v2.1      mRNA    70239   89245   .       +       .       ID=TraesCS1A03G0000400.1;Parent=TraesCS1A03G0000400;Note=TraesCS1A01G000200;primconf=HC;Name=TraesCS1A03G0000400.1;secconf=HC2;cds=CDS_OK;mapping=fullPerfectMatch;previous_id=TraesCS1A02G000200.1

2.目标文件(脚本的输出结果,每行为"基因ID<制表符>good/bad"):

bash 复制代码
TraesCS1A02G002100      bad
TraesCS1A02G002200      bad
TraesCS1A02G002900      bad
TraesCS1A02G003200      bad
TraesCS1A02G003300      bad
TraesCS1A02G003700      good
TraesCS1A02G003900      bad
TraesCS1A02G004100      bad
TraesCS1A02G004300      bad
TraesCS1A02G004700      bad
TraesCS1A02G004800      bad
TraesCS1A02G004900      good
TraesCS1A02G005700      good

3.代码(注意:脚本用 pandas 读取的输入需是带表头的制表符分隔文件,第一列为 id,另有 start、end 两列):

python 复制代码
# lin_whether_1st_intron_longest.py
#! /usr/bin/env python
# Report whether the first intron of each id is its longest one: print "good" if it is, otherwise "bad".
# usage: python lin_whether_1st_intron_longest.py -f1 Ta_genomeplus.gff5-1 > Ta_genomeplus.gff5-120824
# usage: python lin_whether_1st_intron_longest.py -f1 Ta_genomeminus.gff5-1 > Ta_genomeminus.gff5-120824
import pandas as pd

def outputlist1(f1):
    """Compute the gap lengths between consecutive rows of every id in a table.

    The table is loaded with ``pd.read_table(f1, index_col=0)``: the first
    column supplies the id that rows are grouped on, and the file must have
    ``start`` and ``end`` columns. Within one id the rows are used in file
    order, and for each pair of adjacent rows the gap is
    ``next start - previous end - 1`` (presumably the intron between two
    exons; the script header describes the result as intron lengths).

    Parameters
    ----------
    f1 : str, path-like or file-like
        Anything ``pd.read_table`` accepts; must be tab-separated with a
        header line.

    Returns
    -------
    list of [str, str]
        One ``[id, gap_length]`` pair per adjacent row pair, with the length
        rendered as a decimal string. Ids with fewer than three rows (that
        is, fewer than two gaps) are skipped. Ids appear in the sorted order
        produced by ``groupby``; gaps of one id keep file order.
    """
    table = pd.read_table(f1, index_col=0)

    pairs = []
    # Group on the index level itself and read start/end by name, so the
    # result depends neither on positional Series lookups (deprecated in
    # pandas 2.x) nor on how many other columns the file carries.
    for gene_id, rows in table.groupby(level=0, sort=True):
        if len(rows) < 3:
            # Fewer than two gaps: nothing to compare, so the id is omitted.
            continue
        starts = rows["start"].tolist()
        ends = rows["end"].tolist()
        # Pair the end of each row with the start of the following row.
        for prev_end, next_start in zip(ends, starts[1:]):
            gap = int(next_start) - int(prev_end) - 1
            pairs.append([str(gene_id), str(gap)])
    return pairs


def outputlist2(f2):
    """Print, for every id, whether its first length is also its largest.

    Parameters
    ----------
    f2 : list of [id, length]
        Two-field records such as the ones returned by ``outputlist1``;
        each length must be convertible with ``int``. Records of one id are
        compared in the order they appear in ``f2``.

    Output
    ------
    One line ``<id>\\t<good|bad>`` per id on stdout, ids in the sorted order
    produced by ``groupby``. ``good`` means the first length of the id equals
    the maximum of its lengths (ties with later entries still count as
    ``good``); otherwise ``bad``. Nothing is printed for empty input.
    Returns ``None``.
    """
    frame = pd.DataFrame(f2)
    if frame.empty:
        # No records at all (e.g. no id had enough rows upstream): naming
        # the columns of a zero-column frame would raise, so stop here.
        return
    frame.columns = ["id", "length"]

    lines = []
    for gene_id, lengths in frame.groupby("id", sort=True)["length"]:
        values = [int(v) for v in lengths]
        verdict = "good" if values[0] == max(values) else "bad"
        lines.append(str(gene_id) + "\t" + verdict)

    # Emit only after every id was processed, so a bad length value does not
    # leave partial output behind.
    for line in lines:
        print(line)

def outputlist3(f2):
    """Print, for every id, whether its first length is also its largest.

    This is the variant the script calls for input files whose name does not
    contain "plus". It applies the same first-versus-maximum rule as
    ``outputlist2``.

    NOTE(review): for minus-strand data sorted by ascending coordinate the
    transcript's first intron would be the LAST entry of each id, not the
    first. Whether the input is ordered that way cannot be told from this
    file -- confirm before relying on the minus-strand results.

    Parameters
    ----------
    f2 : list of [id, length]
        Two-field records such as the ones returned by ``outputlist1``;
        each length must be convertible with ``int``. Records of one id are
        compared in the order they appear in ``f2``.

    Output
    ------
    One line ``<id>\\t<good|bad>`` per id on stdout, ids in the sorted order
    produced by ``groupby``. ``good`` means the first length of the id equals
    the maximum of its lengths (ties count as ``good``); otherwise ``bad``.
    Nothing is printed for empty input. Returns ``None``.
    """
    records = pd.DataFrame(f2)
    if records.empty:
        # Naming the columns of a zero-column frame would raise; an empty
        # input simply produces no output.
        return
    records.columns = ["id", "length"]

    report = []
    for gene_id, group in records.groupby("id", sort=True)["length"]:
        sizes = [int(size) for size in group]
        if sizes[0] == max(sizes):
            report.append(str(gene_id) + "\t" + "good")
        else:
            report.append(str(gene_id) + "\t" + "bad")

    # Print after all ids were evaluated so failures leave no partial output.
    for row in report:
        print(row)
import argparse
import os
# Command-line entry point: read the table named by -f1/--file1, compute the
# per-id gap lengths and print one "<id>\t<good|bad>" line per id.
parser = argparse.ArgumentParser(
    description="Report whether the first intron of each id is the longest (good) or not (bad)")
# required=True turns a missing -f1 into a normal usage error instead of a
# TypeError raised further down when the path is None.
parser.add_argument("-f1", "--file1", required=True,
                    help="the original file,tabulated,make sure do not contain blank line")
args = parser.parse_args()

# pandas opens the file itself, so no separate open() is needed here.
df01 = outputlist1(args.file1)
# Files whose name contains "plus" are reported by outputlist2, all others
# by outputlist3.
if "plus" in args.file1:
    outputlist2(df01)
else:
    outputlist3(df01)
相关推荐
itwangyang5201 天前
AIDD-人工智能药物设计-字节跳动 PXDesign:AI 设计蛋白,82% 命中率惊艳业界
人工智能·python
Biehmltym1 天前
【AI】01开发环境:Conda_python包/环境管理,10分钟上手
开发语言·python·conda
袁气满满~_~1 天前
Python练习
开发语言·python
我可以将你更新哟1 天前
【爬虫】爬取斗罗大陆漫画,面向对象封装(存入数据库)
数据库·爬虫·python
麦麦大数据1 天前
F060 基于BERTvue+flask电影评论情感分析系统
后端·python·flask·bert·推荐算法·情感分析·电影评论
yongche_shi1 天前
第八十九篇:CAP理论、BASE理论在系统设计中的应用
开发语言·python·面试宝典·cap理论·base理论
小智RE0-走在路上1 天前
Python学习笔记(13) --Mysql,Python关联数据库
数据库·python·学习
YJlio1 天前
杨利杰YJlio|博客导航目录(专栏总览 + 推荐阅读路线)
开发语言·python·pdf
Swizard1 天前
数据不够代码凑?用 Albumentations 让你的 AI 模型“看”得更广,训练快 10 倍!
python·算法·ai·训练
智算菩萨1 天前
【Python机器学习】决策树与随机森林:解释性与鲁棒性的平衡
python·决策树·机器学习