EukDetect: Marker-Gene-Based Annotation of Eukaryotic Microbes

The Microbiome paper "Challenges in capturing the mycobiome from shotgun metagenome data: lack of software and databases" found EukDetect to be relatively accurate.

Installation

https://github.com/allind/EukDetect

```bash
cd Software
git clone https://github.com/allind/EukDetect.git
cd EukDetect
# Download the database from https://figshare.com/articles/dataset/Eukdetect_database/12670856/8?file=34880610
tar -xzvf eukdetect_database_v2.tar.gz 

conda env update --name eukdetect -f environment.yml
conda activate eukdetect
# install eukdetect

python setup.py install
# To test the installation, edit tests/configfile_for_tests.yml with the path
# to the installation directory and the path to the EukDetect database, then run:
python tests/test_eukdetect.py
```

Usage

```bash
# Copy default_configfile.yml to a new your_configfile.yml and edit every parameter as described in it.
# Estimate the mean read length to fill in for readlen:
gzip -dc test.fastq.gz | head -n 10000 | awk '{ if (NR%4==2){count++; bases += length}} END{printf "%3.0f\n", bases/count}'
```
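If you prefer Python, here is a minimal sketch of the same estimate (assuming a gzipped FASTQ; `test.fastq.gz` is a placeholder for one of your files):

```python
# Minimal sketch: mean read length over the first 2,500 FASTQ records,
# mirroring the awk one-liner above (head -n 10000 lines = 2,500 records).
import gzip

def mean_read_length(path, max_records=2500):
    total = count = 0
    with gzip.open(path, "rt") as fh:
        for i, line in enumerate(fh):
            if i // 4 >= max_records:   # stop after max_records records
                break
            if i % 4 == 1:              # sequence line of each 4-line record
                total += len(line.rstrip("\n"))
                count += 1
    return round(total / count) if count else 0

print(mean_read_length("test.fastq.gz"))  # placeholder file name
```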

The configfile.yml file

```yaml
#Default config file for eukdetect. Copy and edit for analysis

#Directory where EukDetect output should be written
output_dir: "/home/zhongpei/diarrhoea/xjs_FJ_metagenomic/drep_bin/all_bin/fungi/eukdetect/"
  
#Indicate whether reads are paired (true) or single (false)
paired_end: true 

#filename excluding sample name. no need to edit if paired_end = false
fwd_suffix: "_clean_1.fastq.gz" 

#filename excluding sample name. no need to edit if paired_end = false
rev_suffix: "_clean_2.fastq.gz"

#file name excluding sample name. no need to edit if paired_end = true 
se_suffix: ".fastq.gz" 

#length of your reads. pre-trimming reads not recommended
readlen: 150

#full path to directory with raw fastq files
fq_dir: "/home/zhongpei/diarrhoea/xjs_FJ_metagenomic/metaMIC_contigs"

#full path to folder with eukdetect database files
database_dir: "/home/zhongpei/hard_disk_sda2/zhongpei/Software/EukDetect/database/"

#name of the database. The default is the original genomes-only database
database_prefix: "ncbi_eukprot_met_arch_markers.fna"

#full path to eukdetect installation folder
eukdetect_dir: "/home/zhongpei/hard_disk_sda2/zhongpei/Software/EukDetect"

#list sample names here. fastqs must correspond to {samplename}{se_suffix} for SE reads or {samplename}{fwd_suffix} and {samplename}{rev_suffix} for PE
#each sample name should be preceded by 2 spaces and followed by a colon character
samples:
  F1:
  F2:
  F3:
  F4:
  F5:
  F6:
  F7:
  F8:
  F9:
  F10:
  F11:
  F12:
  F13:
  F14:
  F15:
  F16:
  F17:
  F18:
  F19:
  F20:
  F21:
  F22:
  F23:
  F24:
  F25:
  F26:
  F27:
  F28:
  F29:
  F30:
  F31:
  F32:
  F33:
  F34:
  F35:
  F36:
  F37:
  F38:
  F39:
  F40:
  F41:
  F42:
  F43:
  F44:
  F45:
  F46:
  F47:
  F48:
  F49:
  F50:
  J1:
  J2:
  J3:
  J4:
  J5:
  J6:
  J7:
  J8:
  J9:
  J10:
  J11:
  J12:
  J13:
  J14:
  J15:
  J16:
  J17:
  J18:
  J19:
  J20:
  J21:
  J22:
  J23:
  J24:
  J25:
  J26:
  J27:
  J28:
  J29:
  J30:
  J31:
  J32:
  J33:
  J34:
  J35:
  J36:
  J37:
  J38:
  J39:
  J40:
  J41:
  J42:
  J43:
  J44:
  J45:
  J46:
  J47:
  J48:
  J49:
  J50:
```
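Typing a hundred sample names by hand is error-prone. A minimal sketch that prints the `samples:` block from the forward-read files in `fq_dir` (the path and suffix below are assumptions copied from the config above; adjust to your data):

```python
# Minimal sketch: derive the samples: block for the config file by listing
# forward-read fastqs in fq_dir and stripping fwd_suffix.
import os

fq_dir = "/home/zhongpei/diarrhoea/xjs_FJ_metagenomic/metaMIC_contigs"  # from config
fwd_suffix = "_clean_1.fastq.gz"                                        # from config

print("samples:")
for name in sorted(os.listdir(fq_dir)):
    if name.endswith(fwd_suffix):
        print(f"  {name[:-len(fwd_suffix)]}:")  # two spaces + sample name + colon
```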

Running EukDetect

```bash
eukdetect --mode runall --configfile ~/your_configfile.yml --cores 32
```

Output files

*_filtered_hits_table.txt is the main results file.
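To sanity-check one sample before batch statistics, a minimal sketch that previews the columns the script below relies on (the `F1` file name is a placeholder; point it at an actual output file):

```python
# Minimal sketch: preview the key columns of one filtered hits table.
# "F1_filtered_hits_table.txt" is a placeholder file name.
import pandas as pd

df = pd.read_csv("F1_filtered_hits_table.txt", sep="\t")
print(df[["Name", "Percent_observed_markers",
          "Total_marker_coverage", "Percent_identity"]].head())
```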

Statistics

```python
#! /usr/bin/env python
#########################################################
# Simplified microbial analysis - generates presence/absence matrix only
# Columns: sample names, Rows: species, Cell: present=1, absent=0

import argparse
import os
import sys

import pandas as pd

def clean_percentage_columns(df):
    """Clean percentage columns"""
    percentage_columns = ['Percent_observed_markers', 'Total_marker_coverage', 'Percent_identity']
    for col in percentage_columns:
        if col in df.columns:
            if df[col].dtype == 'object':
                df[col] = df[col].astype(str).str.replace('%', '').astype(float)
    return df

# Argument parsing
parser = argparse.ArgumentParser(description='Generate microbial presence/absence matrix')
parser.add_argument('--work_path', '-p', required=True, help='Path containing result files')
parser.add_argument('--file_marker', '-m', nargs='+', required=True,
                    help='Substrings a result file name must contain (space-separated)')
parser.add_argument('--output_name', '-o', required=True, help='Output filename prefix')
parser.add_argument('--percent_threshold', '-t', type=float, default=50.0,
                    help='Threshold for Percent_observed_markers (default: 50.0)')

args = parser.parse_args()

# Set working directory
os.chdir(args.work_path)
files = os.listdir(args.work_path)

# Keep files whose names contain every marker substring
ls = [f for f in files if all(marker in f for marker in args.file_marker)]
ls.sort()

print(f"Found {len(ls)} matching files:")
for f in ls:
    print(f"  {f}")

if not ls:
    print("No matching files found!")
    sys.exit(1)

# Collect all microbes passing threshold
all_microbes = set()
sample_microbe_dict = {}

print(f"\nProcessing files (threshold: >= {args.percent_threshold}%)...")

for file_name in ls:
    try:
        # Check file content
        with open(file_name, 'r') as f:
            first_line = f.readline().strip()
            
        if "No taxa passing filter requirements" in first_line or "No taxa" in first_line:
            print(f"  {file_name}: No valid data - Skipped")
            sample_microbe_dict[file_name] = set()
            continue
            
        # Read table
        df = pd.read_csv(file_name, sep='\t')
        
        if 'Name' not in df.columns or 'Percent_observed_markers' not in df.columns:
            print(f"  {file_name}: Missing required columns - Skipped")
            sample_microbe_dict[file_name] = set()
            continue
            
        if len(df) == 0:
            print(f"  {file_name}: No data rows - Skipped")
            sample_microbe_dict[file_name] = set()
            continue
            
        # Clean %
        df = clean_percentage_columns(df)
        
        # Filter by threshold
        filtered_df = df[df['Percent_observed_markers'] >= args.percent_threshold]
        microbes_in_sample = set(filtered_df['Name'].tolist())
        
        sample_microbe_dict[file_name] = microbes_in_sample
        all_microbes.update(microbes_in_sample)
        
        print(f"  {file_name}: {len(df)} records, {len(microbes_in_sample)} passed threshold")
        
    except Exception as e:
        print(f"  {file_name}: Error occurred - {e} - Skipped")
        sample_microbe_dict[file_name] = set()
        continue

# Ensure all files have dictionary entry
for file_name in ls:
    if file_name not in sample_microbe_dict:
        sample_microbe_dict[file_name] = set()

print(f"\nTotal: {len(all_microbes)} unique microbes passed threshold")

if len(all_microbes) == 0:
    print("No microbes passed the threshold!")
    sys.exit(1)

# Create presence/absence matrix
microbe_list = sorted(list(all_microbes))
sample_list = sorted(ls)

print(f"\nCreating matrix of size {len(microbe_list)} x {len(sample_list)}...")

matrix_data = []
for microbe in microbe_list:
    row = [microbe]
    for sample in sample_list:
        row.append(1 if microbe in sample_microbe_dict[sample] else 0)
    matrix_data.append(row)

columns = ['Microbe'] + sample_list
matrix_df = pd.DataFrame(matrix_data, columns=columns)

output_file = f'{args.output_name}_presence_matrix.txt'
matrix_df.to_csv(output_file, sep='\t', index=False)

print(f"\nMatrix saved to: {output_file}")
print(f"Matrix dimensions: {len(microbe_list)} microbes x {len(sample_list)} samples")
print(f"Matrix key: 1 = present and passed threshold, 0 = absent or below threshold")

# Show preview
print(f"\nMatrix preview (first 5 rows and columns):")
preview = matrix_df.iloc[:5, :6] if len(matrix_df.columns) > 6 else matrix_df.head()
print(preview.to_string(index=False))

print("\nAnalysis completed!")