抽提 5'UTR 或 3'UTR 序列,然后计算 G4 score。最后一步计算 G4 score 时,需要用到 2023-10-18 文章《G4Hunter 计算G4 score脚本》中的脚本。
bash
#!/bin/bash
# Extract the 5'UTR sequences of the Song transcriptome, run the realUTR_*
# helper scripts on them, merge the resulting *.txt files and scan the merged
# sequences for G-quadruplex (G4) motifs.
#
# Expects Song_transcriptome.gff3, Song_transcriptome.fa and every helper
# script called below to be present in the current working directory.

# Stop at the first failing command, unset variable or failing pipeline stage,
# so that later steps never run on incomplete intermediate files.
set -euo pipefail

path02=/data/Users/liji/02_data/02_datablast/5_3UTR/Chinese_Spring/CS_2.1/Ta_Songhomogenous5utr
# 3'UTR counterpart of path02; not used by the active 5'UTR steps.
path03=/data/Users/liji/02_data/02_datablast/5_3UTR/Chinese_Spring/CS_2.1/Ta_Songhomogenous3utr

# Start from an empty working directory and without a stale merge file.
rm -rf "${path02}"
mkdir -p "${path02}"
rm -f Ta_Song5utr_merge

# GFF3 -> BED6 for every line mentioning UTR5:
#   * $4-1 turns the 1-based GFF start into a 0-based BED start,
#   * cut keeps only the first ";"-separated attribute (the ID),
#   * the second awk reorders to chrom, start, end, name, score, strand,
#   * sed strips the "ID=" prefix from the name.
grep UTR5 Song_transcriptome.gff3 \
    | awk '{print $1,$4-1,$5,$6,$7,$9}' \
    | cut -d ";" -f1 \
    | awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6,$4,$5}' \
    | sed -e 's/ID=//g' > Song_5utr.bed

# Strand-aware extraction; -name puts the BED name into the FASTA header.
bedtools getfasta -fi Song_transcriptome.fa -bed Song_5utr.bed -s -name -fo Song_5utr.output

# Drop everything from the first "(" on each line (the strand suffix that
# getfasta -s appends to the header).
cut -d "(" -f1 Song_5utr.output > Song_5utr.output1
mv Song_5utr.output1 Song_5utr.output

# Helper scripts (not shown here), run strictly in this order.
bash realUTR_outputnumber01.sh
bash realUTR_changename01.sh
bash realUTR_extract_sequence01.sh
#bash realUTR_delete_one_gene01.sh
bash realUTR_mafft_alignment01.sh
bash realUTR_manylinestooneline01.sh
#bash realUTR_convert_to_0125_01.sh
#bash lin_lijipandas_sum_012_01.sh

# Concatenate every *.txt file in the working directory into one merge file.
# NOTE(review): presumably these files are written by the helper scripts above.
# Globbing directly (instead of parsing `ls` output) is safe for unusual file
# names and does not hit the argument-size limit when there are many files.
for utr_file in "${path02}"/*.txt
do
    cat "${utr_file}"
done >> Ta_Song5utr_merge
#path05=/data/Users/liji/02_data/02_datablast/5_3UTR/Chinese_Spring/Ta_homogenous3utr/*.txt
#for i in $(ls ${path04})
#do
# cat $i >> Ta_Song3utr_merge
#done

# FASTA -> "name<TAB>sequence", then motif search, then wide -> long table.
python lin_format_for_heatmap.py -f1 Ta_Song5utr_merge > Ta_Song5utr_merge1
#python lin_format_for_heatmap.py -f1 Ta_Song3utr_merge > SongTa_3utr_merge1 &&
python lin_extract_G4.py -f1 Ta_Song5utr_merge1 > Ta_Song5utr_merge12
Rscript lin_rg4_wide_data2long_data.R -i Ta_Song5utr_merge12 -o Ta_Song5utr_merge123
#Rscript lin_calculate_rg4score.R -i SongTa_5utr_merge123 -o SongTa_5utr_merge1234 &&
#rm Ta_Song5utr_merge1 &&
#rm Ta_Song5utr_merge12 &&
#rm Ta_Song5utr_merge123 &&
#rm Ta_Song5utr_merge
lin_format_for_heatmap.py如下:
python
"""Flatten a FASTA file into one "name<TAB>sequence" line per record.

Usage: python lin_format_for_heatmap.py -f1 input.fa > output.tsv
"""
import argparse


def parse_fasta(lines):
    """Collect FASTA records from an iterable of text lines.

    Args:
        lines: iterable of strings, e.g. an open text file.

    Returns:
        dict mapping each record name (the header text after ">") to its
        sequence joined onto a single line, in order of first appearance.
        A header that occurs twice restarts that record's sequence.

    Raises:
        ValueError: if non-empty sequence data appears before any header.
    """
    records = {}
    name = None
    for line in lines:
        if line.startswith('>'):
            # Header: keep the text between the first and second ">".
            name = line.strip().split('>')[1]
            records[name] = []
            continue
        # Remove only the line terminator; "\r" is included so files with
        # Windows line endings do not leave carriage returns in the sequence.
        chunk = line.rstrip('\r\n')
        if name is None:
            if chunk:
                raise ValueError("sequence data found before the first '>' header")
            continue
        records[name].append(chunk)
    return {key: ''.join(parts) for key, parts in records.items()}


def main():
    """Parse the command line and print one tab-separated line per record."""
    parser = argparse.ArgumentParser(description="Advanced screening always by hash")
    parser.add_argument("-f1", "--file1", required=True,
                        help="the original file,tabulated,make sure do not contain blank line")
    args = parser.parse_args()
    with open(args.file1, "r") as f1:
        seq = parse_fasta(f1)
    for name, sequence in seq.items():
        print(name + "\t" + sequence)


if __name__ == "__main__":
    main()
lin_extract_G4.py 如下:
python
#! /usr/bin/env python
"""Search each sequence of a "name sequence" table for G-quadruplex motifs.

Usage: python lin_extract_G4.py -f1 input.tsv > out.txt

For every input record six lines are printed, one per motif class, in the
form "<class> <name> <comma-separated matches>". The match list is empty when
the class does not occur in the sequence.
"""
import argparse
import re

# Motif classes in output order. Compiled once instead of once per input line.
G4_PATTERNS = (
    ("G2", re.compile("G{2}[ATCG]{1,9}G{2}[ATCG]{1,9}G{2}[ATCG]{1,9}G{2}")),
    ("G3", re.compile("G{3}[ATCG]{1,15}G{3}[ATCG]{1,15}G{3}[ATCG]{1,15}G{3}")),
    ("G3B1", re.compile("G{3}[ATCG]{1,9}G{2}[ATC]G[ATCG]{1,9}G{3}[ATCG]{1,9}G{3}")),
    ("G3B2", re.compile("G{3}[ATCG]{1,9}G{3}[ATCG]{1,9}G{2}[ATC]G[ATCG]{1,9}G{3}")),
    ("G3V1", re.compile("G{2}[ATCG]{1,9}G{3}[ATCG]{1,9}G{3}[ATCG]{1,9}G{3}")),
    ("G3V2", re.compile("G{3}[ATCG]{1,9}G{3}[ATCG]{1,9}G{3}[ATCG]{1,9}G{2}")),
)


def find_g4_motifs(sequence):
    """Return [(class_label, matches), ...] for one sequence.

    Args:
        sequence: the sequence text to search.

    Returns:
        A list with one (label, list_of_matches) tuple per motif class, in the
        order G2, G3, G3B1, G3B2, G3V1, G3V2. Matches are the non-overlapping
        hits reported by ``findall``, scanned left to right.
    """
    return [(label, pattern.findall(sequence)) for label, pattern in G4_PATTERNS]


def main():
    """Parse the command line and print the motif table to stdout."""
    parser = argparse.ArgumentParser(description="Advanced screening always by hash")
    parser.add_argument("-f1", "--file1", required=True, help="input1")
    args = parser.parse_args()
    with open(args.file1, "r") as f1:
        for line in f1:
            fields = line.strip().split()
            if not fields:
                # Blank line: nothing to report.
                continue
            name = fields[0]
            # A record whose sequence is empty has only one field; treat it
            # as an empty sequence instead of failing on a missing column.
            sequence = fields[1] if len(fields) > 1 else ""
            for label, hits in find_g4_motifs(sequence):
                print(label, name, ",".join(hits))


if __name__ == "__main__":
    main()
lin_rg4_wide_data2long_data.R如下:
r
library(tidyr)
#library(stringr)
library(getopt)
# Reshape the motif table ("Type ID comma-separated-motifs", space separated)
# into a long two-column table with one motif per row.

# getopt specification, one row per option: long name, short flag,
# argument mode (0 = none, 1 = required, 2 = optional), value type, help text.
command <- matrix(c(
  "help", "h", 0, "logical", "显示此帮助信息",
  "input", "i", 1, "character", "输入文件",
  "output", "o", 2, "character", "输出文件"),
  byrow = TRUE, ncol = 5
)
args <- getopt(command)

# Show the usage text and quit when help is requested or no input is given.
if (!is.null(args$help) || is.null(args$input)) {
  cat(paste(getopt(command, usage = TRUE), "\n"))
  q(status = 1)
}

# Default output file name.
if (is.null(args$output)) {
  args$output <- "output.txt"
}

df <- read.table(args$input, sep = " ", header = FALSE, fill = TRUE)
colnames(df) <- c("Type", "ID", "Sequence")
df$Sequence <- as.character(df$Sequence)

# Drop records without any motif. A logical mask is used on purpose:
# df[-which(cond), ] would drop EVERY row when no row satisfies cond.
df <- df[!is.na(df$Sequence) & df$Sequence != "", , drop = FALSE]

# Merge motif class and record name into a single identifier column.
df <- unite(df, "ID01", Type, ID, sep = "_")

if (nrow(df) > 0) {
  # Size the split from the data so that no motif is discarded, however many
  # a record has; rows with fewer motifs are padded with NA on the right.
  n_pieces <- max(lengths(strsplit(df$Sequence, ",", fixed = TRUE)))
  df <- separate(
    df,
    col = Sequence, sep = ",", remove = TRUE,
    into = as.character(seq_len(n_pieces)), fill = "right"
  )

  # Stack the motif columns below each other: all first motifs, then all
  # second motifs, and so on. Padding NAs are kept as rows.
  df01 <- cbind(
    rep(df$ID01, times = n_pieces),
    unlist(df[, -1, drop = FALSE], use.names = FALSE)
  )
} else {
  # No record carries a motif: emit an empty two-column table.
  df01 <- matrix(character(0), nrow = 0, ncol = 2)
}
colnames(df01) <- c("Id", "Sequence")
write.table(df01,file=args$output,quote = F,row.names = F,