安装

bash 复制代码

cd Software
git clone https://github.com/skystreet8/BEAUT
cd BEAUT
conda env create -f environment.yml
conda activate beaut
# 配置环境
mamba install python=3.10 diamond=2.1.9 -c bioconda
mamba install nomkl -y
pip install torch --index-url https://download.pytorch.org/whl/cpu
pip install fair-esm
tar -zxvf nltk_data.tar.gz
cp -r nltk_data/ ~/
#
# 下载模型文件 https://zenodo.org/records/15388149 解压得到data/和data_augmentation/
# 解压后 将 data/ 文件夹放置在 BEAUT/ 目录下
# 将原来有的 data_augmentation_data/ 文件夹放置在 BEAUT/data_augmentation/ 下，然后将 BEAUT/data_augmentation/data_augmentation_data/ 目录重命名为 BEAUT/data_augmentation/data/
# 下载其他文件 https://zenodo.org/records/15388149
# 将 results.tar.gz 文件放入 BEAUT/data_augmentation/PocketMatch/ 目录并解压

运行

BEAUT/esm/esm-extract.sh 修改一下这个文件

bash 复制代码

#export CUDA_VISIBLE_DEVICES=0

#python scripts/extract.py esm2_t33_650M_UR50D $1 $2 --repr_layers 33 --include mean --toks_per_batch 2048
################### up raw #######################
# 1. 禁用 CUDA 变量（或者直接删掉这一行）
# export CUDA_VISIBLE_DEVICES=0

# 2. 设置 CPU 线程优化（充分利用你的 92 核）
# 建议不要全占满，设置 80 个核心用于计算，留一点给系统
export OMP_NUM_THREADS=80
export MKL_NUM_THREADS=80

# 3. 修改 python 命令
# - 添加 --device cpu (强制使用 CPU)
# - 调大 --toks_per_batch (因为你有 900G 内存，可以显著调大以提升并行度)
python scripts/extract.py esm2_t33_650M_UR50D $1 $2 \
    --repr_layers 33 \
    --include mean \
    --toks_per_batch 16384 \
    --nogpu

准备 BEAUT/scripts/filter_non_enzymes_3.0.py

python 复制代码

import re
import argparse
import pandas as pd
from functools import partial
from nltk import word_tokenize
import logging
from utils import *

# Logging Configuration
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger('PreFilter')

def get_parser():
    parser = argparse.ArgumentParser(description='Pre-filter sequences using eggNOG before BEAUT prediction')
    parser.add_argument('-f', '--faa', required=True, help='Input protein sequence file (.fasta/.faa)')
    parser.add_argument('-e', '--egg', required=True, help='eggNOG annotation file (.annotations)')
    parser.add_argument('-o', '--out_prefix', required=True, help='Output file prefix')
    return parser

def main():
    args = get_parser().parse_args()
    
    # Define eggNOG reader
    read_tsv = partial(pd.read_csv, sep='\t', comment='#',
                       names=['query', 'seed_ortholog', 'evalue', 'score','eggNOG_OGs', 'max_annot_lvl', 'COG_category', 'Description',
                              'Preferred_name', 'GOs', 'EC', 'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass',
                              'BRITE', 'KEGG_TC', 'CAZy', 'BiGG_Reaction', 'PFAMs'])

    # Exclusion keywords (From original BEAUT logic)
    filter_ase_words = {'permease', 'peptidase', 'helicase', 'exonuclease', 'atpases', 'heparinase', 'dipeptidyl-peptidase',
                        'dextransucrase', 'sortase', 'endonuclease', 'aminopeptidase', 'oligopeptidase',
                        'oligoendopeptidase', 'carboxypeptidase', 'permeases', 'caspase', 'primase', 'atpase',
                        'pip-5-kinases', "5'-nucleotidase", 'ceramidase', 'metallopeptidase', 'nuclease', 'melibiase',
                        'trehalase', 'amylase', 'xyloglucanase', 'lectin/glucanases', 'proteinase', 'collagenase',
                        'ribonuclease', 'calcium-release', 'terminase', 'excisionase', 'metalloprotease', 'polymerase',
                        'topoisomerase', 'd-transpeptidase', 'depolymerase', 'beta-glucanase', 'elastase',
                        'alpha-trehalase', 'phospholipase_d-nuclease', 'metallocarboxypeptidase', 'gtpase', 'invertase',
                        'endonuclease/exonuclease/phosphatase', 'aminopeptidases', 'acylaminoacyl-peptidases', 'telomerase',
                        'metallo-peptidase', 'replicase', 'fbpase', 'gyrase', 'endo-isopeptidase', 'endoglucanase',
                        '1,4-beta-xylanase', 'hydrogenlyase', 'carnosinase', 'polygalactosaminidase', 'cam-kinase',
                        'chitinase', 'chondroitinase', 'metalloproteinase', 'cyclo-malto-dextrinase', 'gamma-secretase',
                        'metallo-endopeptidase', 'fe-hydrogenase', 'f0f1-atpase', 'trnase', 'l-aminopeptidase', 'dutpase',
                        'inj_translocase', 'ntpase', 'pectinesterase', 'metallo-endoribonuclease', 'exosortase',
                        'beta-1,3-glucanase', 'dgtpase', 'carboxypetidase', 'transcriptase',
                        'protein-phosphocysteine-l-ascorbate-phosphotransferase', 'exoribonuclease', 'coagulase',
                        'recombinase', 'atcase', 'hydrogenase', 'interferase', 'nadase', 'endosialidase', 'nucleases',
                        'transposases', 'dipeptidase', 'nickase/helicase', 'rnase', 'dnase/trnase', '-atpase', 'coprotease',
                        'reactivase', 'metal-chelatase', 'translocase', 'ferrochelatase', 'v-atpase', 'dnase', 'scramblase',
                        'host-nuclease', 'peptidoglycan-synthase', 'chitosanase', 'peptide-n-glycosidase',
                        'cobaltochelatase', 'exodeoxyribonuclease', 'disaggregatase', 'integrase', 'relaxase', 'maturase',
                        'exopeptidases', 'xylanase', 'insulinase', 'metalloendopeptidase', 'arch_atpase', 'de-polymerase',
                        'transposase', 'proteases', 'dapkinase', 'flippase', 'transpeptidase', 'barnase',
                        'polygalacturonase', 'n-atpase', 'ld-carboxypeptidase', 'endopeptidase', 'gyrase/topoisomerase',
                        'autokinase', 'catalase', 'tripeptidases', 'gtpases', 'activase', 'endo-1,4-beta-xylanase',
                        'amylopullulanase', 'aa_permease', 'metallo-carboxypeptidase', 'chelatase', 'aaa-atpase',
                        'protease', 'helicases', 'dna-methyltransferase', 'cellulase', 'endoribonuclease', 'levanase',
                        'excinuclease', 'peptidases', 'convertase', 'apyrase', 'beta-xylanase', 'endodeoxyribonuclease',
                        'dl-endopeptidase', 'lecithinase', 'resolvase', 'primase/polymerase', 'primase-helicase',
                        'cutinase', '1,4-beta-cellobiosidase', 'dipeptidylpeptidase', 'd-aminopeptidase',
                        'amylopullulanase'}

    # 1. Load eggNOG Data
    logger.info(f'Loading eggNOG annotations from {args.egg}...')
    df = read_tsv(args.egg)
    
    # 2. Core Filtering Logic
    df_with_ec = df.query('EC != "-"').copy()
    df_wo_ec = df.query('EC == "-"').copy()
    df_wo_desc = df_wo_ec.query('Description == "-"').copy()
    df_with_desc = df_wo_ec.query('Description != "-"').copy()

    keep_ase_indexes = []
    for t in df_with_desc.itertuples():
        desc = t[8].lower()
        words = word_tokenize(desc)
        if not any([w.endswith('ase') or w.endswith('ases') for w in words]):
            continue
        for w in words:
            if w in filter_ase_words:
                break
        else:
            keep_ase_indexes.append(t[0])
    
    ase_df = df_with_desc.loc[keep_ase_indexes].copy()
    
    # Recover uncharacterized/unknown functions
    remove_indexes = [i for i in df_with_desc.index if i not in set(keep_ase_indexes)]
    not_ase_df = df_with_desc.loc[remove_indexes].copy()
    recover_indexes = [t[0] for t in not_ase_df.itertuples() if any(x in t[8].lower() for x in ['unknown function', 'duf', 'uncharacterised']) or 'non supervised orthologous group' in t[8].lower()]
            
    recovered_df = not_ase_df.loc[recover_indexes].copy()
    
    # Final list of candidate IDs
    candidate_df = pd.concat([df_with_ec, ase_df, df_wo_desc, recovered_df], ignore_index=True)
    candidate_ids = set(candidate_df['query'].values.tolist())
    
    # 3. Filter FASTA and Save
    logger.info(f'Filtering FASTA file: {args.faa}...')
    headers, seqs = ReformatFastaFile(args.faa)
    
    filtered_headers = []
    filtered_seqs = []
    
    for h, s in zip(headers, seqs):
        clean_h = re.split(r'\s+', h)[0]
        if clean_h in candidate_ids:
            filtered_headers.append(h)
            filtered_seqs.append(s)
    
    faa_out = f"{args.out_prefix}_prefiltered.fasta"
    SaveFastaFile(faa_out, filtered_headers, filtered_seqs)
    
    logger.info(f'Original count: {len(headers)} | After filtering: {len(filtered_headers)}')
    logger.info(f'Filtered FASTA saved to: {faa_out}')

if __name__ == '__main__':
    main()

我的宏基因组非冗余基因集已经跑过eggnog了

bash 复制代码

python ${scripts_dir}/filter_non_enzymes_3.0.py \
	-f ${ref_protein_file} -e ${eggnog_result}/all_cds_ref_eggnog.emapper.annotations -o ${eggnog_result}/all_cds_ref_pre_BEAUT

把结果fasta文件放到 `BEAUT/esm 文件夹内，并去除*号`

bash 复制代码

sed '/>/!s/\*//g' ref_BEAUT_prefiltered.fasta > ref_BEAUT_clean.fasta #去除序列的*号

使用esm-2进行氨基酸序列预处理

bash 复制代码

#!/bin/bash

# 1. 路径定义
beaut_root="/home/zhongpei/hard_disk_sda2/zhongpei/Software/BEAUT"
# 建议进入 esm 目录执行，确保相对路径正确
cd ${beaut_root}/esm

# 2. 线程分配
export OMP_NUM_THREADS=20
export MKL_NUM_THREADS=20
export CUDA_VISIBLE_DEVICES="" 

# 3. 启动 8 路并行
echo "Starting 8-way parallel extraction..."

for i in {1..8}; do
    target_fasta="all_cds_ref_pre_BEAUT_clean.part_00${i}.fasta"
    
    # 每个进程拥有独立的输出目录，避免 I/O 冲突
    out_dir="../data/part${i}"
    mkdir -p ${out_dir}
    
    # 提交后台任务
    nohup bash ${beaut_root}/esm/esm-extract.sh ${target_fasta} ${out_dir} > p${i}.log 2>&1 &
    
    echo "Task ${i} submitted: ${target_fasta} -> ${out_dir}"
done

echo "-------------------------------------------------------"
echo "Check progress: tail -f p1.log"
echo "Monitor CPU: top (Each should be ~2000%)"
echo "-------------------------------------------------------"

进行esm-2结果合并

bash 复制代码

mkdir -p all_embeddings

for i in {1..8}; do
    for file in part${i}/embeddings_*.pt; do
        # 获取文件名，例如 embeddings_0.pt
        fname=$(basename "$file")
        # 移动并重命名为 part1_embeddings_0.pt 等
        mv "$file" "all_embeddings/part${i}_${fname}"
    done
done

cd all_embeddings
count=0
for f in *.pt; do
    mv "$f" "embeddings_${count}.pt"
    let count++
done

#### 进行预测 ######
cd BEAUT/scripts
python convert_embedding_chunks.py -f all_cds_ref_pre_BEAUT_prefiltered --multiple
python test_bulk.py -f all_cds_ref_pre_BEAUT_prefiltered

得到 data/all_cds_ref_pre_BEAUT_prefiltered_results_BEAUT_aug.pkl

构建 BEAUT/scripts/process_bulk_predictions_zp.py

bash 复制代码

import pandas as pd
import pickle
import os
from utils import ReadFastaFile, SaveFastaFile

# Define paths
result_file = '../data/all_cds_ref_pre_BEAUT_prefiltered_results_BEAUT_aug.pkl'
fasta_input = '../esm/all_cds_ref_pre_BEAUT_clean.fasta'
output_dir = '../data/all_cds_ref_pre_BEAUT_prefiltered_aug/'

# Check if output directory exists
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load BEAUT prediction results
print("Loading prediction results...")
with open(result_file, 'rb') as f:
    results = pickle.load(f)

# Load original sequences
print("Reading FASTA file...")
headers, seqs = ReadFastaFile(fasta_input)

# Match scores with headers
# Note: results[h][1] is the probability of being an enzyme
print("Matching scores...")
scores = [results[h][1] for h in headers]

# Create DataFrame
df = pd.DataFrame({
    'header': headers, 
    'seq': seqs, 
    'pred_score': scores
})

# Sort by prediction score descending
df.sort_values('pred_score', ascending=False, inplace=True)

# Filter by threshold 0.5
pos_df = df.query('pred_score >= 0.5').copy()
print(f'{len(pos_df)} sequences were predicted to be positive.')

# Save results in chunks (100,000 sequences per file)
chunk_size = 100000
num_chunks = int(len(pos_df) // chunk_size) + 1

for i in range(1, num_chunks + 1):
    start_idx = (i - 1) * chunk_size
    end_idx = i * chunk_size
    sub_df = pos_df.iloc[start_idx:end_idx]
    
    if sub_df.empty:
        continue
        
    # Save CSV
    csv_name = f'{output_dir}positive_results_pt{i}.csv'
    sub_df.to_csv(csv_name, index=False)
    
    # Save FASTA
    fasta_name = f'{output_dir}positive_results_pt{i}.fasta'
    sub_headers = sub_df['header'].values.tolist()
    sub_seqs = sub_df['seq'].values.tolist()
    SaveFastaFile(fasta_name, sub_headers, sub_seqs)
    
    print(f'Saved chunk {i}: {csv_name} and {fasta_name}')

print("All processes completed successfully.")

python BEAUT/scripts/process_bulk_predictions_zp.py

得到 data/all_cds_ref_pre_BEAUT_prefiltered_aug/ 里面有 csv和fasta结果信息

长度过滤按照文献保留 157 到 1074 aa的序列

bash 复制代码

import pandas as pd
import os
import argparse

def read_fasta(path):
    headers, seqs = [], []
    with open(path, 'r') as f:
        current_seq = []
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                if current_seq:
                    seqs.append("".join(current_seq))
                headers.append(line[1:])
                current_seq = []
            else:
                current_seq.append(line)
        if current_seq:
            seqs.append("".join(current_seq))
    return headers, seqs

def main():
    parser = argparse.ArgumentParser(description='Sync filter for BEAUT CSV and FASTA files.')
    parser.add_argument('--input_dir', type=str, required=True, help='Directory with pt1-pt4 files')
    parser.add_argument('--min_len', type=int, default=157)
    parser.add_argument('--max_len', type=int, default=1074)
    args = parser.parse_args()

    # 1. Process and Filter CSV Data
    csv_files = sorted([f for f in os.listdir(args.input_dir) if f.startswith('positive_results_pt') and f.endswith('.csv')])
    print(f"Reading {len(csv_files)} CSV files...")
    
    csv_list = []
    for f in csv_files:
        temp_df = pd.read_csv(os.path.join(args.input_dir, f))
        csv_list.append(temp_df)
    
    full_df = pd.concat(csv_list, ignore_index=True)
    full_df['seq_len'] = full_df['seq'].str.len()
    
    # Apply length filter to DataFrame
    mask = (full_df['seq_len'] >= args.min_len) & (full_df['seq_len'] <= args.max_len)
    filtered_df = full_df[mask].copy()
    
    csv_out = os.path.join(args.input_dir, 'final_enzymes_filtered.csv')
    filtered_df.to_csv(csv_out, index=False)
    
    # 2. Process and Filter FASTA Data
    fasta_files = sorted([f for f in os.listdir(args.input_dir) if f.startswith('positive_results_pt') and f.endswith('.fasta')])
    print(f"Reading and syncing {len(fasta_files)} FASTA files...")
    
    # Use a set for high-speed header lookup
    valid_headers = set(filtered_df['header'].values)
    
    fasta_out = os.path.join(args.input_dir, 'final_enzymes_filtered.fasta')
    kept_count = 0
    
    with open(fasta_out, 'w') as out_f:
        for f in fasta_files:
            h_list, s_list = read_fasta(os.path.join(args.input_dir, f))
            for h, s in zip(h_list, s_list):
                if h in valid_headers:
                    out_f.write(f">{h}\n{s}\n")
                    kept_count += 1

    # Final Statistics
    print("-" * 40)
    print("Filtration Results Summary:")
    print(f"Total sequences input : {len(full_df)}")
    print(f"Sequences kept        : {len(filtered_df)}")
    print(f"Sequences removed     : {len(full_df) - len(filtered_df)}")
    print(f"Output CSV            : {csv_out}")
    print(f"Output FASTA          : {fasta_out}")
    print("-" * 40)

if __name__ == "__main__":
    main()

python BEAUT/scripts/length_filter.py python length_filter.py --input_dir data/all_cds_ref_pre_BEAUT_prefiltered_aug/

得到 all_cds_ref_pre_BEAUT_length_filtered.fasta

CLEAN 安装

Enzyme function prediction using contrastive learning | Science

GitHub - tttianhao/CLEAN: CLEAN: a contrastive learning model for high-quality functional prediction of proteins

bash 复制代码

cd ~/software

git clone https://github.com/tttianhao/CLEAN
cd CLEAN/app

conda create -n clean python=3.10 -y
conda activate clean

# 装 PyTorch（CPU 版示例）
mamba install pytorch==1.11.0 torchvision torchaudio cpuonly -c pytorch

# 其余 Python 依赖
pip install -r requirements.txt

# 把 CLEAN 自身也装进环境
python build.py install

# 下载 https://drive.google.com/file/d/1kwYd4VtzYuMvJMWXy6Vks91DSUAOcKpZ/view?usp=sharing
# 复制到data/
cd data
unzip pretrained.zip
# 重命名为解压结果为 pretrained/
cd ..
# 把length_filter.py结果的fasta放到data/inputs
nohup python CLEAN_infer_fasta.py --fasta_data all_cds_ref_pre_BEAUT_length_filtered

结果在 CLEAN/app/results/inputs

python 复制代码

import pandas as pd
from ec_utils import eval_ec, sort_ecs
import os

# --- 1. Load the single CLEAN results file ---
path_clean_maxsep = '/home/zhongpei/hard_disk_sda2/zhongpei/Software/CLEAN/app/results/inputs/all_cds_ref_pre_BEAUT_length_filtered_maxsep.csv'

# CLEAN output has 6 columns: Header + 5 Predictions
merged_ec_df = pd.read_csv(
    path_clean_maxsep, 
    names=['header', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']
)
merged_ec_df.fillna('-', inplace=True)

# Apply eval_ec to each prediction column
for i in range(1, 6):
    merged_ec_df[f'pred_{i}'] = merged_ec_df[f'pred_{i}'].apply(eval_ec)

# Create a dictionary for fast mapping: {header: sorted_ec_list}
# range(2, 7) corresponds to t[2] through t[6] in itertuples (pred_1 to pred_5)
ec_predictions = {
    t[1]: sort_ecs([t[i][0] for i in range(2, 7) if t[i][0] != 'NONE']) 
    for t in merged_ec_df.itertuples()
}

# --- 2. Load and merge the 4 metadata parts (pt1 to pt4) ---
metadata_dfs = []
metadata_dir = '../data/all_cds_ref_pre_BEAUT_prefiltered_aug'

for i in range(1, 5):
    # Construct the path for positive_results_pt1.csv, pt2.csv, etc.
    pt_path = os.path.join(metadata_dir, f'positive_results_pt{i}.csv')
    if os.path.exists(pt_path):
        pt_df = pd.read_csv(pt_path)
        metadata_dfs.append(pt_df)
    else:
        print(f"Warning: {pt_path} not found.")

merged_df = pd.concat(metadata_dfs, ignore_index=True)

# --- 3. Assign CLEAN ECs to the merged metadata ---
# t[1] is the header/ID in your metadata csv
merged_df = merged_df.assign(
    clean_ec=[ec_predictions.get(t[1], []) for t in merged_df.itertuples()]
)

# --- 4. Save final output ---
output_final = '../data/all_cds_ref_pre_BEAUT_prefiltered_aug/all_cds_ref_pre_BEAUT_length_filtered_EC.csv'

# Ensure the output directory exists
os.makedirs(os.path.dirname(output_final), exist_ok=True)

merged_df.to_csv(output_final, index=False)

print(f"Merge completed successfully.")
print(f"Total sequences in metadata: {len(merged_df)}")
print(f"Total sequences with CLEAN annotations: {len(ec_predictions)}")
print(f"Final file saved to: {output_final}")

python add_clean_predictions_zp.py

得到 all_cds_ref_pre_BEAUT_length_filtered_EC.csv

python 复制代码

import pickle
from ec_utils import sort_ecs_short, shorten_ec
import pandas as pd
from collections import Counter
import ast

def shorten_ec_list(t):
    return [shorten_ec(s) for s in t]

# 1. Load the KEGG name dictionary
ec2names = pickle.load(open('../data/kegg_ec2names.pkl', 'rb'))

# 2. Read your 4-column CSV file
input_path = '/home/zhongpei/hard_disk_sda2/zhongpei/Software/BEAUT/data/all_cds_ref_pre_BEAUT_prefiltered_aug/all_cds_ref_pre_BEAUT_length_filtered_EC.csv'
raw_df_aug = pd.read_csv(input_path)

# 3. Safely convert string representation of list to Python list
# It uses ast.literal_eval which is safer than eval()
raw_df_aug['clean_ec_short'] = raw_df_aug['clean_ec'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# 4. Correct outdated EC numbers
for i in raw_df_aug.index:
    short_ecs = raw_df_aug.at[i, 'clean_ec_short']
    if not isinstance(short_ecs, list):
        continue
    for j in range(len(short_ecs)):
        if short_ecs[j] == '3.3.1.1':
            short_ecs[j] = '3.13.2.1'
        elif short_ecs[j] == '3.3.1.2':
            short_ecs[j] = '3.13.2.2'
        elif short_ecs[j] == '3.3.1.3':
            short_ecs[j] = '3.2.1.148'

# 5. Collapse to 3-digit categories (e.g., 3.5.1.24 -> 3.5.1)
raw_df_aug['clean_ec_short'] = raw_df_aug['clean_ec_short'].apply(shorten_ec_list)

# 6. Optimized counting (replaces the slow nested loop and t[26])
print("Calculating global EC statistics...")
global_counter = Counter()
for ecs in raw_df_aug['clean_ec_short']:
    global_counter.update(ecs)

# Get sorted unique EC list
all_ecs = sort_ecs_short(list(global_counter.keys()))

# 7. Generate names from the dictionary
names = []
for ec in all_ecs:
    try:
        digits = ec.split('.')
        ec1 = digits[0]
        ec2 = '.'.join(digits[:2])
        # Fetching names with "Unknown" as fallback
        n1 = ec2names.get(ec1, "Unknown")
        n2 = ec2names.get(ec2, "Unknown")
        n3 = ec2names.get(ec, "Unknown")
        names.append(', '.join([n1, n2, n3]))
    except:
        names.append("Name mapping not found")

# 8. Create summary DataFrame and save
ec_stat_df_aug = pd.DataFrame({
    'EC': all_ecs, 
    'total': [global_counter[ec] for ec in all_ecs], 
    'name': names
})

output_path = '/home/zhongpei/hard_disk_sda2/zhongpei/Software/BEAUT/data/all_cds_ref_pre_BEAUT_prefiltered_aug/all_cds_ref_pre_BEAUT_length_filtered_EC_stat.csv'
ec_stat_df_aug.to_csv(output_path, index=False)

print(f"Success! Statistics saved to {output_path}")

python ec_stat_zp.py

得到 all_cds_ref_pre_BEAUT_length_filtered_EC_stat.csv

使用EFI-EST获取蛋白聚类

结果邮件会出来

python 复制代码

import pickle
import networkx as nx
from copy import deepcopy
import xml.parsers.expat
import time
import logging
logger = logging.getLogger('process_xgmml_graph')
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)


class XGMMLParserHelper(object):
    def __init__(self):
        self._graph = nx.DiGraph()
        self._parser = xml.parsers.expat.ParserCreate()
        self._parser.StartElementHandler = self._start_element
        self._parser.EndElementHandler = self._end_element
        self._tagstack = list()

        self._network_att_el = dict()
        self._current_att_el = dict()
        self._current_list_att_el = list()
        self._current_obj = dict()

    def _start_element(self, tag, attr):
        self._tagstack.append(tag)

        if tag == 'graph':
            self._network_att_el = dict()

        if tag == 'node' or tag == 'edge':
            self._current_obj = dict(attr)

        if tag == 'att' and (self._tagstack[-2] == 'node' or
                             self._tagstack[-2] == 'edge'):
            if 'value' in attr:
                self._current_att_el = self._parse_att_el(self._current_att_el,
                                                          tag, attr)
            elif attr['type'] == 'list':
                self._current_list_name = attr['name']
                self._current_att_el[attr['name']] = list()

        if tag == 'att' and (self._tagstack[-2] == 'att'):
            self._current_list_att_el = dict(attr)
            if 'value' in attr:
                self._current_list_att_el = self._parse_att_el(
                    self._current_list_att_el, tag, attr)
                self._current_att_el[self._current_list_name].append(
                    self._current_list_att_el[attr['name']])

        if tag == 'att' and self._tagstack[-2] == 'graph':
            if 'value' in attr:
                self._network_att_el[attr['name']] = attr['value']

    def _parse_att_el(self, att_el, tag, attr):
        if 'value' in attr:
            if attr['type'] == 'string':
                att_el[attr['name']] = attr['value']
            elif attr['type'] == 'real':
                att_el[attr['name']] = float(attr['value'])
            elif attr['type'] == 'integer':
                att_el[attr['name']] = int(attr['value'])
            elif attr['type'] == 'boolean':
                att_el[attr['name']] = bool(attr['value'])
            else:
                raise NotImplementedError(attr['type'])

            return att_el

    def _end_element(self, tag):
        if tag == 'node':
            for k in remove_node_keys:
                if k in self._current_att_el:
                    del self._current_att_el[k]
            for k in keep_node_keys_repl:
                if k in self._current_att_el:
                    self._current_att_el[keep_node_keys_repl[k]] = deepcopy(self._current_att_el[k])
                    del self._current_att_el[k]
            if 'label' in self._current_obj:
                if 'label' in self._current_att_el:
                    self._current_att_el['@label'] = self._current_att_el['label']
                    del self._current_att_el['label']

                self._graph.add_node(self._current_obj['id'],
                                     label=self._current_obj['label'],
                                     **self._current_att_el)
            else:
                self._graph.add_node(self._current_obj['id'],
                                     **self._current_att_el)
            self._current_att_el = dict()
        elif tag == 'edge':
            self._current_att_el['fident'] = self._current_att_el['%id']
            del self._current_att_el['%id']
            self._graph.add_edge(self._current_obj['source'],
                                 self._current_obj['target'],
                                 **self._current_att_el)
            self._current_att_el = dict()

        self._tagstack.pop()

    def parseFile(self, file):
        self._parser.ParseFile(file)

    def graph(self):
        return self._graph

    def graph_attributes(self):
        return self._network_att_el


def XGMMLReader(graph_file):
    parser = XGMMLParserHelper()
    parser.parseFile(graph_file)
    return parser.graph()


ssn_name = 'alnscore60_full'
keep_node_keys_repl = {
    'Taxonomy ID': 'taxonomy_id',
    'UniProt Annotation Status': 'uniprot_annotation_status',
    'SwissProt Description': 'swissprot_description',
    'Sequence Length': 'length',
    'Sequence Status': 'sequence_status',
    'Number of IDs in Rep Node': 'num_members'
}
remove_node_keys = ['Sequence Source', 'Other IDs', 'Gene Name', 'NCBI IDs', 'List of IDs in Rep Node', 'Superkingdom', 'Kingdom', 'Phylum', 'Class',
                    'Order', 'Family',
                    'Genus', 'Species', 'PDB', 'TIGRFAMs', 'InterPro (Domain)', 'InterPro (Family)', 'InterPro (Homologous Superfamily)',
                    'InterPro (Other)', 'BRENDA ID', 'Cazy Name', 'GO Term', 'KEGG ID', 'PATRIC ID', 'STRING ID', 'HMP Body Site', 'HMP Oxygen',
                    'P01 gDNA', 'Rhea', 'AlphaFold', 'Sequence']

start_time = time.monotonic()
graph = XGMMLReader(
    open(f'/home/zhongpei/hard_disk_sda2/zhongpei/Software/BEAUT/data/all_cds_ref_pre_BEAUT_full_ssn.xgmml',
         'rb')
)
end_time = time.monotonic()
logger.info(f'{round(end_time - start_time, 0)}s used for loading the graph.')
start_time = time.monotonic()
graph = graph.to_undirected()
end_time = time.monotonic()
logger.info(f'{round(end_time - start_time, 0)}s used for transforming the graph to undirected.')

logger.info(f'There are {graph.number_of_nodes()} nodes in the graph.')
logger.info(f'There are {graph.number_of_edges()} egdes in the graph.')
n_connected_components = len(list(nx.connected_components(graph)))
logger.info(f'There are {n_connected_components} clusters in the graph.')
cluster_sizes = [len(c) for c in nx.connected_components(graph)]
logger.info(f'Largest cluster: {max(cluster_sizes)}')
logger.info(f'There are {sum([n >= 3 for n in cluster_sizes])} clusters with >= 3 sequences.')
logger.info(f'There are {sum([n >= 5 for n in cluster_sizes])} clusters with >= 5 sequences.')
logger.info(f'There are {sum([n >= 10 for n in cluster_sizes])} clusters with >= 10 sequences.')
logger.info(f'There are {sum([n >= 50 for n in cluster_sizes])} clusters with >= 50 sequences.')
logger.info(f'There are {sum([n >= 100 for n in cluster_sizes])} clusters with >= 100 sequences.')
logger.info(f'There are {sum([n >= 500 for n in cluster_sizes])} clusters with >= 500 sequences.')
logger.info(f'There are {sum([n >= 1000 for n in cluster_sizes])} clusters with >= 1000 sequences.')
logger.info(f'There are {sum([n == 1 for n in cluster_sizes])} singletons.')

conn_comps = list(nx.connected_components(graph))
conn_comps.sort(key=lambda t: len(t), reverse=True)
conn_comps_to_save = []
for comp in conn_comps:
    conn_comps_to_save.append([])
    for n in comp:
        conn_comps_to_save[-1].extend(graph.nodes[n]['Description'])
with open(f'/home/zhongpei/hard_disk_sda2/zhongpei/Software/BEAUT/data/all_cds_ref_pre_BEAUT_prefiltered_aug/all_cds_ref_pre_BEAUT_prefiltered_{ssn_name}_clusters.pkl', 'wb') as f:
    pickle.dump(conn_comps_to_save, f)
f.close()

python process_xgmml_graph_zp.py

得到 all_cds_ref_pre_BEAUT_prefiltered_alnscore60_full_clusters.pkl

python 复制代码

import pickle
import pandas as pd
from ec_utils import shorten_ec
from collections import defaultdict

df = pd.read_csv('/home/zhongpei/hard_disk_sda2/zhongpei/Software/BEAUT/data/all_cds_ref_pre_BEAUT_prefiltered_aug/all_cds_ref_pre_BEAUT_length_filtered_EC.csv')
df.set_index('header', inplace=True)
clu_60 = pickle.load(open('/home/zhongpei/hard_disk_sda2/zhongpei/Software/BEAUT/data/all_cds_ref_pre_BEAUT_prefiltered_aug/all_cds_ref_pre_BEAUT_prefiltered_alnscore60_full_clusters.pkl', 'rb'))

cluster_info = pd.DataFrame({'cluster_id':
                                 [f'cluster_{i}' for i in range(1, len(clu_60) + 1)],
                             'size':
                                 [len(t) for t in clu_60]
                             })
cluster_ec_info = []
for indexes in clu_60:
    qdf = df.loc[indexes].copy().sort_values('pred_score', ascending=False)
    ec_dict = defaultdict(int)
    for t in qdf.itertuples():
        this_ecs = [shorten_ec(s) for s in eval(t[-1])]
        this_ecs = set(this_ecs)
        for k in this_ecs:
            ec_dict[k] += 1
    ec_info = ''
    most_common_ecs = list(ec_dict.items())
    most_common_ecs.sort(key=lambda x: x[1], reverse=True)
    most_common_ecs = most_common_ecs[:3]
    for t in most_common_ecs:
        ec_info += f'EC:{t[0]}, {round(t[1] / len(qdf) * 100, 1)}%; '
    cluster_ec_info.append(ec_info[:-2])

top1_ec = []
top1_ec_ratio = []
for s in cluster_ec_info:
    if s == 'NONE':
        top1_ec.append('-')
        top1_ec_ratio.append('-')
    else:
        try:
            top1_ec.append(s.split('; ')[0].split(',')[0])
            top1_ec_ratio.append(s.split('; ')[0].split(', ')[1])
        except:
            print(s)
            raise

top2_ec = []
top2_ec_ratio = []
for s in cluster_ec_info:
    if s == 'NONE':
        top2_ec.append('-')
        top2_ec_ratio.append('-')
    else:
        strings = s.split('; ')
        if len(strings) == 1:
            top2_ec.append('-')
            top2_ec_ratio.append('-')
        else:
            top2_ec.append(strings[1].split(', ')[0])
            top2_ec_ratio.append(strings[1].split(', ')[1])

top3_ec = []
top3_ec_ratio = []
for s in cluster_ec_info:
    if s == 'NONE':
        top3_ec.append('-')
        top3_ec_ratio.append('-')
    else:
        strings = s.split('; ')
        if len(strings) < 3:
            top3_ec.append('-')
            top3_ec_ratio.append('-')
        else:
            top3_ec.append(strings[2].split(', ')[0])
            top3_ec_ratio.append(strings[2].split(', ')[1])

cluster_info = cluster_info.assign(top1_ec=top1_ec, top1_ec_ratio=top1_ec_ratio,
                                   top2_ec=top2_ec, top2_ec_ratio=top2_ec_ratio, top3_ec=top3_ec, top3_ec_ratio=top3_ec_ratio)
cluster_info.to_csv('/home/zhongpei/hard_disk_sda2/zhongpei/Software/BEAUT/data/all_cds_ref_pre_BEAUT_prefiltered_aug/cluster_info.csv',
                    index=False)

header2clu = {}
for i, t in enumerate(clu_60):
    for h in t:
        header2clu[h] = i + 1
headers = []
for t in clu_60:
    headers.extend(t)

qdf = df.loc[headers].copy()
qdf['cluster'] = [header2clu[t[0]] for t in qdf.itertuples()]
qdf.sort_values(by=['cluster', 'pred_score'], ascending=[True, False], inplace=True)
qdf['cluster'] = qdf['cluster'].astype(str)
for i in qdf.index:
    qdf.at[i, 'cluster'] = f"cluster_{qdf.at[i, 'cluster']}"
cols = qdf.columns.values.tolist()
qdf = qdf[cols[:2] + ['cluster'] + cols[2:-1]]
qdf.reset_index(inplace=True)
qdf.to_csv('/home/zhongpei/hard_disk_sda2/zhongpei/Software/BEAUT/data/all_cds_ref_pre_BEAUT_prefiltered_aug/cluster_results.csv',
           index=False)

python process_ssn_clusters_zp.py

得到 cluster_results.csv 和 cluster_info.csv

BEAUT：胆汁酸酶注释

安装

运行

准备 BEAUT/scripts/filter_non_enzymes_3.0.py

把结果fasta文件放到 BEAUT/esm 文件夹内，并去除*号

使用esm-2进行氨基酸序列预处理

进行esm-2结果合并

CLEAN 安装

使用EFI-EST获取蛋白聚类

把结果fasta文件放到 `BEAUT/esm 文件夹内，并去除*号`