EukDetect: Marker-Gene-Based Annotation of Eukaryotic Microbes

The Microbiome paper "Challenges in capturing the mycobiome from shotgun metagenome data: lack of software and databases" found EukDetect to be relatively accurate.

Installation

https://github.com/allind/EukDetect

```bash
cd Software
git clone https://github.com/allind/EukDetect.git
cd EukDetect
# Download the database from https://figshare.com/articles/dataset/Eukdetect_database/12670856/8?file=34880610
tar -xzvf eukdetect_database_v2.tar.gz 

conda env update --name eukdetect -f environment.yml
conda activate eukdetect
# install eukdetect

python setup.py install
# To test the installation, edit tests/configfile_for_tests.yml with the path
# to the installation directory and the path to the EukDetect database, then run:
python tests/test_eukdetect.py
```

Usage

```bash
# Copy default_configfile.yml to a new your_configfile.yml and edit every parameter as described in it.
# Estimate the mean read length to fill in for readlen:
gzip -dc test.fastq.gz | head -n 10000 | awk '{ if (NR%4==2){count++; bases += length}} END{printf "%3.0f\n", bases/count}'
```
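If you prefer Python, here is a minimal sketch of the same estimate (assuming a gzipped FASTQ; `test.fastq.gz` is a placeholder for one of your files):

```python
# Minimal sketch: mean read length over the first 2,500 FASTQ records,
# mirroring the awk one-liner above (head -n 10000 lines = 2,500 records).
import gzip

def mean_read_length(path, max_records=2500):
    total = count = 0
    with gzip.open(path, "rt") as fh:
        for i, line in enumerate(fh):
            if i // 4 >= max_records:   # stop after max_records records
                break
            if i % 4 == 1:              # sequence line of each 4-line record
                total += len(line.rstrip("\n"))
                count += 1
    return round(total / count) if count else 0

print(mean_read_length("test.fastq.gz"))  # placeholder file name
```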

The configfile.yml file

```yaml
#Default config file for eukdetect. Copy and edit for analysis

#Directory where EukDetect output should be written
output_dir: "/home/zhongpei/diarrhoea/xjs_FJ_metagenomic/drep_bin/all_bin/fungi/eukdetect/"
  
#Indicate whether reads are paired (true) or single (false)
paired_end: true 

#filename excluding sample name. no need to edit if paired_end = false
fwd_suffix: "_clean_1.fastq.gz" 

#filename excluding sample name. no need to edit if paired_end = false
rev_suffix: "_clean_2.fastq.gz"

#file name excluding sample name. no need to edit if paired_end = true 
se_suffix: ".fastq.gz" 

#length of your reads. pre-trimming reads not recommended
readlen: 150

#full path to directory with raw fastq files
fq_dir: "/home/zhongpei/diarrhoea/xjs_FJ_metagenomic/metaMIC_contigs"

#full path to folder with eukdetect database files
database_dir: "/home/zhongpei/hard_disk_sda2/zhongpei/Software/EukDetect/database/"

#name of the database. The default is the original genomes-only database
database_prefix: "ncbi_eukprot_met_arch_markers.fna"

#full path to eukdetect installation folder
eukdetect_dir: "/home/zhongpei/hard_disk_sda2/zhongpei/Software/EukDetect"

#list sample names here. fastqs must correspond to {samplename}{se_suffix} for SE reads or {samplename}{fwd_suffix} and {samplename}{rev_suffix} for PE
#each sample name should be preceded by 2 spaces and followed by a colon character
samples:
  F1:
  F2:
  F3:
  F4:
  F5:
  F6:
  F7:
  F8:
  F9:
  F10:
  F11:
  F12:
  F13:
  F14:
  F15:
  F16:
  F17:
  F18:
  F19:
  F20:
  F21:
  F22:
  F23:
  F24:
  F25:
  F26:
  F27:
  F28:
  F29:
  F30:
  F31:
  F32:
  F33:
  F34:
  F35:
  F36:
  F37:
  F38:
  F39:
  F40:
  F41:
  F42:
  F43:
  F44:
  F45:
  F46:
  F47:
  F48:
  F49:
  F50:
  J1:
  J2:
  J3:
  J4:
  J5:
  J6:
  J7:
  J8:
  J9:
  J10:
  J11:
  J12:
  J13:
  J14:
  J15:
  J16:
  J17:
  J18:
  J19:
  J20:
  J21:
  J22:
  J23:
  J24:
  J25:
  J26:
  J27:
  J28:
  J29:
  J30:
  J31:
  J32:
  J33:
  J34:
  J35:
  J36:
  J37:
  J38:
  J39:
  J40:
  J41:
  J42:
  J43:
  J44:
  J45:
  J46:
  J47:
  J48:
  J49:
  J50:
```
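Typing a hundred sample names by hand is error-prone. A minimal sketch that prints the `samples:` block from the forward-read files in `fq_dir` (the path and suffix below are assumptions copied from the config above; adjust to your data):

```python
# Minimal sketch: derive the samples: block for the config file by listing
# forward-read fastqs in fq_dir and stripping fwd_suffix.
import os

fq_dir = "/home/zhongpei/diarrhoea/xjs_FJ_metagenomic/metaMIC_contigs"  # from config
fwd_suffix = "_clean_1.fastq.gz"                                        # from config

print("samples:")
for name in sorted(os.listdir(fq_dir)):
    if name.endswith(fwd_suffix):
        print(f"  {name[:-len(fwd_suffix)]}:")  # two spaces + sample name + colon
```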

Running EukDetect

```bash
eukdetect --mode runall --configfile ~/your_configfile.yml --cores 32
```

Output files

*_filtered_hits_table.txt is the main results file.
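To sanity-check one sample before batch statistics, a minimal sketch that previews the columns the script below relies on (the `F1` file name is a placeholder; point it at an actual output file):

```python
# Minimal sketch: preview the key columns of one filtered hits table.
# "F1_filtered_hits_table.txt" is a placeholder file name.
import pandas as pd

df = pd.read_csv("F1_filtered_hits_table.txt", sep="\t")
print(df[["Name", "Percent_observed_markers",
          "Total_marker_coverage", "Percent_identity"]].head())
```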

Statistics

```python
#! /usr/bin/env python
#########################################################
# Simplified microbial analysis - generates presence/absence matrix only
# Columns: sample names, Rows: species, Cell: present=1, absent=0

import argparse
import os
import sys

import pandas as pd

def clean_percentage_columns(df):
    """Clean percentage columns"""
    percentage_columns = ['Percent_observed_markers', 'Total_marker_coverage', 'Percent_identity']
    for col in percentage_columns:
        if col in df.columns:
            if df[col].dtype == 'object':
                df[col] = df[col].astype(str).str.replace('%', '').astype(float)
    return df

# Argument parsing
parser = argparse.ArgumentParser(description='Generate microbial presence/absence matrix')
parser.add_argument('--work_path', '-p', required=True, help='Path containing result files')
parser.add_argument('--file_marker', '-m', nargs='+', required=True,
                    help='Substrings a result file name must contain (space-separated)')
parser.add_argument('--output_name', '-o', required=True, help='Output filename prefix')
parser.add_argument('--percent_threshold', '-t', type=float, default=50.0,
                    help='Threshold for Percent_observed_markers (default: 50.0)')

args = parser.parse_args()

# Set working directory
os.chdir(args.work_path)
files = os.listdir(args.work_path)

# Keep files whose names contain every marker substring
ls = [f for f in files if all(marker in f for marker in args.file_marker)]
ls.sort()

print(f"Found {len(ls)} matching files:")
for f in ls:
    print(f"  {f}")

if not ls:
    print("No matching files found!")
    sys.exit(1)

# Collect all microbes passing threshold
all_microbes = set()
sample_microbe_dict = {}

print(f"\nProcessing files (threshold: >= {args.percent_threshold}%)...")

for file_name in ls:
    try:
        # Check file content
        with open(file_name, 'r') as f:
            first_line = f.readline().strip()
            
        if "No taxa passing filter requirements" in first_line or "No taxa" in first_line:
            print(f"  {file_name}: No valid data - Skipped")
            sample_microbe_dict[file_name] = set()
            continue
            
        # Read table
        df = pd.read_csv(file_name, sep='\t')
        
        if 'Name' not in df.columns or 'Percent_observed_markers' not in df.columns:
            print(f"  {file_name}: Missing required columns - Skipped")
            sample_microbe_dict[file_name] = set()
            continue
            
        if len(df) == 0:
            print(f"  {file_name}: No data rows - Skipped")
            sample_microbe_dict[file_name] = set()
            continue
            
        # Clean %
        df = clean_percentage_columns(df)
        
        # Filter by threshold
        filtered_df = df[df['Percent_observed_markers'] >= args.percent_threshold]
        microbes_in_sample = set(filtered_df['Name'].tolist())
        
        sample_microbe_dict[file_name] = microbes_in_sample
        all_microbes.update(microbes_in_sample)
        
        print(f"  {file_name}: {len(df)} records, {len(microbes_in_sample)} passed threshold")
        
    except Exception as e:
        print(f"  {file_name}: Error occurred - {e} - Skipped")
        sample_microbe_dict[file_name] = set()
        continue

# Ensure all files have dictionary entry
for file_name in ls:
    if file_name not in sample_microbe_dict:
        sample_microbe_dict[file_name] = set()

print(f"\nTotal: {len(all_microbes)} unique microbes passed threshold")

if len(all_microbes) == 0:
    print("No microbes passed the threshold!")
    sys.exit(1)

# Create presence/absence matrix
microbe_list = sorted(list(all_microbes))
sample_list = sorted(ls)

print(f"\nCreating matrix of size {len(microbe_list)} x {len(sample_list)}...")

matrix_data = []
for microbe in microbe_list:
    row = [microbe]
    for sample in sample_list:
        row.append(1 if microbe in sample_microbe_dict[sample] else 0)
    matrix_data.append(row)

columns = ['Microbe'] + sample_list
matrix_df = pd.DataFrame(matrix_data, columns=columns)

output_file = f'{args.output_name}_presence_matrix.txt'
matrix_df.to_csv(output_file, sep='\t', index=False)

print(f"\nMatrix saved to: {output_file}")
print(f"Matrix dimensions: {len(microbe_list)} microbes x {len(sample_list)} samples")
print(f"Matrix key: 1 = present and passed threshold, 0 = absent or below threshold")

# Show preview
print(f"\nMatrix preview (first 5 rows and columns):")
preview = matrix_df.iloc[:5, :6] if len(matrix_df.columns) > 6 else matrix_df.head()
print(preview.to_string(index=False))

print("\nAnalysis completed!")