Python 数据可视化 boxplot
python
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Load the tab-separated results table.
df = pd.read_csv('result.tsv', sep='\t')

# Split the rows into normal and tumor samples by a substring match on the
# sample name.  na=False makes a missing sample_name count as "no match";
# without it the mask contains NaN and boolean indexing raises.
normal_df = df[df["sample_name"].str.contains("normal", na=False)]
tumor_df = df[df["sample_name"].str.contains("tumor", na=False)]

# Quick seaborn one-liners, kept for reference:
# sns.boxplot(x='down_level', y='loci_median_depth', data=tumor_df)
# sns.boxplot(x='down_level', y='loci_average_depth', data=tumor_df)
def box_plot_1(df):
    """Draw one box of ``loci_median_depth`` per (sample, down-level) pair.

    Rows of ``df`` are grouped first by ``sample_name`` and then by
    ``down_level``, both in order of first appearance.  All boxes go on a
    single figure, which is written to ``boxplot.png`` and then shown.

    Args:
        df: DataFrame with ``sample_name``, ``down_level`` and
            ``loci_median_depth`` columns.
    """
    # sample -> level -> list of depth values.  Plain dicts keep insertion
    # order, so boxes follow the order in which samples/levels first appear.
    depths_by_sample = {}
    rows = zip(df['sample_name'], df['down_level'], df['loci_median_depth'])
    for sample, level, depth in rows:
        values = depths_by_sample.setdefault(sample, {}).setdefault(level, [])
        # A NaN would turn the statistics of the whole box into NaN (nothing
        # gets drawn), so keep the group but leave missing depths out.
        if pd.notna(depth):
            values.append(depth)

    # Flatten the nested mapping into parallel lists, one entry per box.
    labels = []
    datasets = []
    for sample, levels in depths_by_sample.items():
        for level, values in levels.items():
            labels.append(f"{sample} - {level}")
            datasets.append(values)
    positions = list(range(1, len(datasets) + 1))

    plt.figure(figsize=(100, 60))
    # boxplot() cannot be called with zero datasets; an empty frame simply
    # produces an empty, titled figure.
    if datasets:
        plt.boxplot(datasets, positions=positions)
        # Tick labels are set through xticks() instead of boxplot's `labels=`
        # keyword, which was renamed to `tick_labels` in Matplotlib 3.9;
        # xticks() works on either side of that rename.
        plt.xticks(positions, labels)
    plt.ylabel('Depth')
    plt.title('Box Plot of Depth Data by Sample and Level')
    plt.grid(True)
    plt.xticks(rotation=45)

    # Save before show(): some backends release the figure once it is shown.
    plt.savefig('boxplot.png')
    plt.show()
def box_plot_2(df, target_header_list, levels=None):
    """Save and show one box-plot figure per down level.

    For every level, the rows of ``df`` whose ``down_level`` equals that
    level are selected and one box per column in ``target_header_list`` is
    drawn, with the individual data points overlaid as translucent red dots.
    Each figure is written to ``boxplot_<level>.png`` and then shown.

    Args:
        df: DataFrame with a ``down_level`` column plus the target columns.
        target_header_list: Names of the columns to draw, one box each.
        levels: Levels to plot, in order.  Defaults to
            ``["ori", "40", "36", "32", "28", "24", "20"]``.
    """
    if levels is None:
        levels = ["ori", '40', '36', '32', '28', '24', '20']

    # Compare levels as text so that a column parsed as integers (40) still
    # matches the string level '40'.
    level_as_text = df['down_level'].astype(str)
    tick_positions = list(range(1, len(target_header_list) + 1))

    for level_to_plot in levels:
        filtered_df = df[level_as_text == str(level_to_plot)]
        # A NaN would blank out that column's box entirely, so drop them.
        columns = [filtered_df[col].dropna() for col in target_header_list]

        plt.figure(figsize=(20, 15))
        plt.boxplot(columns, positions=tick_positions)
        # Tick labels are set through xticks() instead of boxplot's `labels=`
        # keyword, which was renamed to `tick_labels` in Matplotlib 3.9.
        plt.xticks(tick_positions, target_header_list, rotation=45)
        plt.ylabel('Depth')
        plt.title(f'Box Plot of Depth Data for {level_to_plot} Level')
        plt.grid(True)

        # Overlay every individual data point on top of its box.
        for position, values in zip(tick_positions, columns):
            plt.plot([position] * len(values), values, 'ro', alpha=0.5)

        # Save first, then display.
        plt.savefig(f'boxplot_{level_to_plot}.png')
        plt.show()
def box_plot_3(df, target_header_list):
    """Draw a grouped seaborn box plot of several depth columns per down level.

    The columns in ``target_header_list`` are melted into long form (one row
    per value, tagged with its source column in ``Depth_Type``) and plotted
    with ``down_level`` on the x axis and one hue per source column.  The
    figure is written to ``boxplot.png``.

    Args:
        df: DataFrame with a ``down_level`` column plus the target columns.
        target_header_list: Names of the columns to plot.
    """
    # Long format: one (down_level, Depth_Type, Depth) row per original cell.
    melted_df = df.melt(
        id_vars=['down_level'],
        value_vars=target_header_list,
        var_name='Depth_Type',
        value_name='Depth',
    )

    plt.figure(figsize=(12, 8))
    sns.boxplot(x='down_level', y='Depth', hue='Depth_Type', data=melted_df, dodge=True)
    plt.xlabel('Down Level (G)')
    plt.ylabel('Depth')
    plt.title('Box Plot of Depth Data by Down Level')
    plt.legend(title='Depth Type', loc='upper right')
    plt.grid(True)
    plt.savefig('boxplot.png')
    print()  # writes a single blank line to stdout
# Down-level labels, from the original data ("ori") down to '20'.
lvl_list = [
    "ori",
    '40', '36', '32',
    '28', '24', '20',
]

# Depth columns drawn by the plotting helpers.
target_header_list = [
    "loci_median_depth",
    "loci_average_depth",
    "dedup_loci_median_depth",
    "dedup_loci_average_depth",
    "average_depth",
    "median_depth",
    "dedup_average_depth",
    "dedup_median_depth",
]

# Active run: grouped seaborn plot for the normal samples.
box_plot_3(normal_df, target_header_list)

# Other runs, disabled:
# box_plot(tumor_df, target_header_list)
# box_plot_2(normal_df, target_header_list)
# box_plot_1(normal_df)
box_plot_2
box_plot_3
参考: