项目简介
EasyDB 是一个基于 Rust + Tauri 的轻量级桌面 SQL 查询工具。用户可以直接用 SQL 查询本地文件（CSV、Excel、JSON、Parquet 等），无需安装数据库。
本文将深入探讨其技术架构、关键设计决策和实现细节。
一、整体架构
sql
┌─────────────────────────────────────────────────────┐
│ 前端层 │
│ React 18 + TypeScript + Vite │
│ HeroUI + Tailwind CSS │
│ @tanstack/react-virtual (虚拟滚动) │
│ Ace Editor (SQL 编辑器) │
├─────────────────────────────────────────────────────┤
│ Tauri v2 桥接层 │
│ IPC 调用: fetch, fetch_page, writer, etc. │
├─────────────────────────────────────────────────────┤
│ Rust 后端 │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ SQL Parse │ │ Context │ │ Readers │ │
│ │ sqlparser │→ │ DataFusion │→ │ CSV/Excel │ │
│ │ EasyDBDialect│ │ SessionCtx │ │ MySQL/PG │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ SQL Export │ │ Storage │ │
│ │ Generator │ │ rusqlite │ │
│ └─────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────┘
二、SQL 解析与 AST 转换
2.1 为什么需要 AST 转换?
DataFusion 支持标准 SQL，但不认识 read_csv('/path') 这种表函数语法。因此 EasyDB 需要把用户写的 SQL：
sql
SELECT * FROM read_csv('/data/file.csv') WHERE id = 1;
转换成 DataFusion 能理解的:
sql
SELECT * FROM table0 WHERE id = 1;
同时在 SessionContext 中把 read_csv('/data/file.csv') 对应的数据源注册为名为 table0 的表。
2.2 自定义 SQL 方言
rust
#[derive(Debug)]
pub struct EasyDBDialect;
impl Dialect for EasyDBDialect {
fn is_identifier_start(&self, ch: char) -> bool {
GenericDialect.is_identifier_start(ch)
}
fn is_identifier_part(&self, ch: char) -> bool {
GenericDialect.is_identifier_part(ch)
}
fn supports_named_fn_args_with_expr_name(&self) -> bool {
false // 关闭,简化解析
}
}
2.3 递归 AST 遍历
SQL 中可能包含子查询、JOIN 等嵌套结构，因此需要递归遍历 AST：
rust
#[async_recursion]
pub async fn convert_table_name(
ctx: &mut SessionContext,
query: &mut Box<Query>,
mut table_count: i32,
) -> AppResult<i32> {
if let Select(select) = &mut *query.body {
for table_with_joins in &mut select.from {
match &mut table_with_joins.relation {
TableFactor::Derived { subquery, .. } => {
// 递归处理子查询
table_count = convert_table_name(ctx, subquery, table_count).await?;
}
relation => {
table_count = register_table(ctx, relation, table_count).await?;
}
}
// JOIN 子句也要递归处理
for join in &mut table_with_joins.joins {
match &mut join.relation {
TableFactor::Derived { subquery, .. } => {
table_count = convert_table_name(ctx, subquery, table_count).await?;
}
relation => {
table_count = register_table(ctx, &mut join.relation, table_count).await?;
}
}
}
}
}
Ok(table_count)
}
2.4 表函数注册
rust
/// Registers the source behind one `FROM`/`JOIN` relation under a generated name.
///
/// When `relation` is a `TableFactor::Table`, the function:
/// 1. builds the name `table{table_count}`,
/// 2. obtains the source location via `get_table_path(args)`,
/// 3. dispatches on the reader name (`read_csv`, `read_excel`/`read_xlsx`,
///    `read_mysql`, `read_postgres`, ...) to register the source in `ctx`,
/// 4. rewrites the relation in place so it refers to the generated name and
///    carries no arguments.
///
/// Returns `table_count + 1`. The counter is advanced even when `relation` is
/// not a `TableFactor::Table` and nothing was registered.
///
/// Errors from `get_table_path`, option parsing and the individual readers are
/// propagated with `?`.
pub async fn register_table(
ctx: &mut SessionContext,
relation: &mut TableFactor,
table_count: i32,
) -> AppResult<i32> {
if let TableFactor::Table { name, args, .. } = relation {
let table_name = format!("table{}", table_count);
// NOTE(review): presumably the first argument of the table function — confirm
// against `get_table_path`.
let table_path = get_table_path(args)?;
let reader_name = name.to_string();
match reader_name.as_str() {
"read_csv" => {
ctx.register_csv(&table_name, &table_path,
get_csv_read_options(args, CsvReadOptions::default())?)
.await?
}
"read_excel" | "read_xlsx" => {
// Excel is loaded into a single batch up front and registered as such.
ctx.register_batch(&table_name,
read_excel(ExcelReader::new(table_path), args)?)?;
}
"read_mysql" => {
register_mysql(ctx, &table_name, &table_path, args).await?;
}
"read_postgres" => {
register_postgres(ctx, &table_name, &table_path, args).await?;
}
// ... (remaining arms are omitted in this excerpt)
}
// Point the AST node at the generated table and drop the function arguments,
// turning the table-function call into a plain table reference.
*name = sqlparser::ast::ObjectName(vec![table_name.as_str().into()]);
*args = None;
}
Ok(table_count + 1)
}
三、Excel 读取:从零实现 Arrow RecordBatch
DataFusion 原生不支持 Excel，因此我基于 calamine 实现了一个完整的 Excel → Arrow 转换器。
3.1 Builder 模式设计
rust
/// Builder-style configuration for reading an Excel sheet into an Arrow
/// `RecordBatch`.
pub struct ExcelReader {
// Location of the workbook. NOTE(review): assumed to be a filesystem path — confirm.
path: String,
// Sheet to read; `None` leaves the choice to the reader.
// NOTE(review): the fallback sheet is not visible in this excerpt.
sheet_name: Option<String>,
infer_schema_length: usize, // defaults to 100
// NOTE(review): presumably toggles date parsing of cells; no setter is shown here.
try_parse_dates: bool,
}
impl ExcelReader {
/// Creates a reader for `path` (body omitted in this excerpt).
pub fn new(path: String) -> Self { /* ... */ }
/// Consumes the builder and returns it with `sheet_name` applied.
pub fn with_sheet_name(mut self, sheet_name: String) -> Self { /* ... */ }
/// Consumes the builder and returns it with `infer_schema_length` applied.
pub fn with_infer_schema_length(mut self, infer_schema_length: usize) -> Self { /* ... */ }
/// Consumes the builder and produces the resulting `RecordBatch`, or an error.
pub fn finish(self) -> AppResult<RecordBatch> { /* ... */ }
}
3.2 Schema 推断
rust
/// Infers an Arrow `Schema` for an Excel range by sampling its cells.
///
/// * `range` – the cell range to inspect.
/// * `infer_schema_length` – maximum number of rows sampled for type inference.
///
/// Column names come from `range.headers()`; when there are none, the names
/// `t1, t2, t3, ...` are generated. Every field is nullable.
///
/// Column type resolution, based on the set of cell types observed in the sample:
/// `Float64` > `Int64` > `Timestamp(Nanosecond)` > `Utf8`.
/// `Float64` is checked first so a column that mixes whole and fractional
/// numbers is widened to `Float64` instead of being declared `Int64`, which
/// cannot represent the fractional cells. Columns with no non-empty sampled cell
/// fall back to `Utf8`.
///
/// NOTE(review): `rows` and `num_columns` are not defined in this excerpt; they
/// are used exactly as in the original snippet.
/// NOTE(review): a column that mixes numbers with text still resolves to a
/// numeric type, and the header row may be part of the sample — confirm this is
/// intended.
pub fn infer_field_schema(range: &Range<Data>, infer_schema_length: usize) -> AppResult<Schema> {
    let headers: Vec<String> = match range.headers() {
        Some(names) => names.iter().map(|h| h.to_string()).collect(),
        // No header row: synthesize t1, t2, t3, ...
        None => rows
            .iter()
            .enumerate()
            .map(|(i, _)| format!("t{}", i + 1))
            .collect(),
    };

    // One set of observed cell types per column.
    let mut data_types: Vec<HashSet<DataType>> = vec![HashSet::new(); num_columns];
    for row in range.rows().take(infer_schema_length) {
        for (i, cell) in row.iter().enumerate() {
            // Empty cells carry no type information; extra cells beyond the
            // declared width are ignored.
            if i < num_columns && !matches!(cell, Data::Empty) {
                data_types[i].insert(infer_cell_data_type(cell));
            }
        }
    }

    let fields: Vec<Field> = data_types
        .iter()
        .enumerate()
        .map(|(i, types)| {
            let data_type = if types.contains(&DataType::Float64) {
                // Any fractional value forces the wider numeric type.
                DataType::Float64
            } else if types.contains(&DataType::Int64) {
                DataType::Int64
            } else if types.contains(&DataType::Timestamp(TimeUnit::Nanosecond, None)) {
                DataType::Timestamp(TimeUnit::Nanosecond, None)
            } else {
                // Covers both "only text seen" and "nothing seen at all".
                DataType::Utf8
            };
            Field::new(headers[i].clone(), data_type, true)
        })
        .collect();

    Ok(Schema::new(fields))
}
3.3 类型推断细节
Excel 内部存储的特殊性:
rust
/// Maps a single Excel cell to the Arrow type it would naturally be stored as.
///
/// * `Data::Int` → `Int64`
/// * `Data::Float` → `Int64` when the value is a whole number that fits in an
///   `i64`, otherwise `Float64`
/// * `Data::DateTime` / `Data::DateTimeIso` → `Timestamp(Nanosecond, None)`
/// * anything else → `Utf8`
pub(crate) fn infer_cell_data_type(cell: &Data) -> DataType {
    // 2^63: the first whole number above `i64::MAX`; exactly representable as f64.
    const I64_LIMIT: f64 = 9_223_372_036_854_775_808.0;

    match cell {
        Data::Int(_) => DataType::Int64,
        Data::Float(v) => {
            // Numeric cells arrive as floats, so whole numbers are detected by
            // an empty fractional part. NaN and infinities have a NaN fraction
            // and therefore stay `Float64`.
            let is_whole = v.fract() == 0.0;
            // Whole numbers such as 1e19 cannot be stored in an i64; keep them
            // as floats instead of promising an integer column.
            let fits_i64 = (-I64_LIMIT..I64_LIMIT).contains(v);
            if is_whole && fits_i64 {
                DataType::Int64
            } else {
                DataType::Float64
            }
        }
        Data::DateTime(_) | Data::DateTimeIso(_) => {
            DataType::Timestamp(TimeUnit::Nanosecond, None)
        }
        _ => DataType::Utf8,
    }
}
3.4 日期转换
rust
/// Converts a date-like Excel cell into nanoseconds since the Unix epoch.
///
/// * `Data::DateTime` is converted through calamine's `as_datetime`.
/// * `Data::DateTimeIso` is parsed as `YYYY-MM-DDTHH:MM:SS`, then
///   `YYYY-MM-DD HH:MM:SS`, then as a bare `YYYY-MM-DD` date at midnight.
/// * Every other cell kind yields `None`.
///
/// The naive date-time is interpreted as UTC. `None` is also returned when
/// parsing fails or the instant does not fit into an `i64` nanosecond count.
pub(crate) fn excel_cell_to_timestamp_nanos(cell: &Data) -> Option<i64> {
    // Tried in order; the first format that parses wins.
    const DATETIME_FORMATS: [&str; 2] = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"];

    let naive_dt = match cell {
        Data::DateTime(dt) => dt.as_datetime()?,
        Data::DateTimeIso(text) => {
            let with_time = DATETIME_FORMATS
                .iter()
                .find_map(|fmt| NaiveDateTime::parse_from_str(text, *fmt).ok());
            match with_time {
                Some(parsed) => parsed,
                None => {
                    // Date-only text: anchor it at 00:00:00.
                    let date = NaiveDate::parse_from_str(text, "%Y-%m-%d").ok()?;
                    date.and_hms_opt(0, 0, 0).unwrap_or_default()
                }
            }
        }
        _ => return None,
    };

    DateTime::<Utc>::from_naive_utc_and_offset(naive_dt, Utc).timestamp_nanos_opt()
}
四、SQL 导出引擎
4.1 核心数据结构
rust
/// Per-column export setting; deserializable, so it can be supplied from
/// outside the Rust code.
#[derive(Clone, Debug, Deserialize)]
pub struct ExportColumnConfig {
// NOTE(review): presumably the column name as it appears in the query result — confirm.
pub source_column_name: String,
// NOTE(review): presumably the column name to emit in the generated SQL — confirm.
pub export_column_name: String,
// SQL type given as text. NOTE(review): presumably parsed into `SqlType` before
// rows are formatted — confirm.
pub sql_type: String,
}
// Pre-parsed column type, so the hot formatting loop does not have to allocate
// or compare strings.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum SqlType {
Bool, Int, Float, Text, Unknown,
}
4.2 INSERT 生成
rust
/// Renders the rows of `df` as SQL `INSERT` statements for `table_name`.
///
/// * `max_values_per_insert` – upper bound of rows per statement; values below 1
///   are treated as 1.
/// * `db_dialect` – selects identifier quoting (backticks for MySQL, double
///   quotes for PostgreSQL).
/// * `export_columns` – optional per-column export configuration, resolved
///   against the result headers.
/// * `empty_text_as_null` – forwarded to row extraction.
///
/// The whole result set is collected into memory before formatting.
///
/// NOTE(review): `columns`, `flush_chunk` and `sql_statements` are not defined
/// in this excerpt.
pub async fn generate_sql_inserts(
df: DataFrame,
table_name: &str,
max_values_per_insert: usize,
db_dialect: &Dialect,
export_columns: Option<&[ExportColumnConfig]>,
empty_text_as_null: bool,
) -> AppResult<String> {
let batches = df.collect().await?;
let headers = extract_headers_from_batches(&batches);
let export_specs = resolve_export_specs(&headers, export_columns)?;
// Dialect difference: MySQL quotes identifiers with backticks, PostgreSQL with
// double quotes.
// NOTE(review): `table_name` is interpolated as-is; a name containing the quote
// character would break out of the identifier — confirm it is validated upstream.
let insert_header_template = match db_dialect {
Dialect::MySQL => format!("INSERT INTO `{}` ({}) VALUES\n", table_name, columns),
Dialect::PostgreSQL => format!("INSERT INTO \"{}\" ({}) VALUES\n", table_name, columns),
};
// Chunking: buffer rows and flush whenever `chunk_limit` rows have accumulated.
let chunk_limit = max_values_per_insert.max(1);
let mut pending_rows: Vec<Vec<String>> = Vec::with_capacity(chunk_limit);
extract_rows_from_batches(batches, &export_specs, empty_text_as_null, |row| {
pending_rows.push(row);
if pending_rows.len() == chunk_limit {
flush_chunk(&mut pending_rows)?;
}
Ok(())
})?;
// NOTE(review): rows still buffered here (fewer than `chunk_limit`) need one
// final flush; none is visible in this excerpt — confirm it exists.
Ok(sql_statements)
}
4.3 类型感知格式化
rust
/// Turns one already-stringified cell into a SQL literal suited to its column type.
///
/// * The exact text `NULL` is always emitted as SQL `NULL`.
/// * `Bool` columns are delegated to `format_bool_for_sql`.
/// * `Int` / `Float` columns emit the value when it parses as an `i64`
///   (re-rendered from the parsed integer) or as a finite `f64` (original text
///   kept); anything else becomes `NULL`.
/// * `Text` / `Unknown` columns are wrapped in single quotes with embedded
///   single quotes doubled. When `empty_text_as_null` is set, an empty value
///   becomes `NULL` instead of `''`.
pub(crate) fn format_cell_for_sql(
    formatted_value: &str,
    col_type: SqlType,
    empty_text_as_null: bool,
) -> String {
    const SQL_NULL: &str = "NULL";

    if formatted_value == SQL_NULL {
        return SQL_NULL.to_owned();
    }

    match col_type {
        SqlType::Bool => format_bool_for_sql(formatted_value),
        SqlType::Int | SqlType::Float => match formatted_value.parse::<i64>() {
            Ok(int_value) => int_value.to_string(),
            Err(_) => match formatted_value.parse::<f64>() {
                // Keep the source text so no float re-formatting changes the digits.
                Ok(float_value) if float_value.is_finite() => formatted_value.to_owned(),
                // NaN, infinities and unparsable text have no SQL numeric literal.
                _ => SQL_NULL.to_owned(),
            },
        },
        SqlType::Text | SqlType::Unknown => {
            if empty_text_as_null && formatted_value.is_empty() {
                return SQL_NULL.to_owned();
            }
            // Double every single quote, then wrap the result in quotes.
            let escaped = formatted_value.replace('\'', "''");
            let mut literal = String::with_capacity(escaped.len() + 2);
            literal.push('\'');
            literal.push_str(&escaped);
            literal.push('\'');
            literal
        }
    }
}
五、数据库联查:MySQL / PostgreSQL
EasyDB 不仅支持本地文件，还可以直接连接 MySQL 和 PostgreSQL，并与本地文件做联合查询。
5.1 MySQL 连接
rust
/// Registers a MySQL table in `ctx` under `table_name`.
///
/// * `table_name` – name the table gets inside the DataFusion session.
/// * `table_path` – despite its name, this is the MySQL table name.
/// * `args` – table-function arguments; the connection string is taken from the
///   named argument `conn` (the extraction code is omitted in this excerpt).
///
/// A connection pool is created for the connection string with `sslmode` set to
/// `disabled`, and a table provider for the bare table reference is registered.
///
/// Errors from creating the pool, building the provider and registering the
/// table are propagated with `?`.
pub async fn register_mysql(
ctx: &mut SessionContext,
table_name: &String,
table_path: &String, // actually the table name
args: &mut Option<TableFunctionArgs>,
) -> AppResult<()> {
let mut conn: Option<String> = None;
// Extract the `conn` connection string from the named arguments.
// ...
// NOTE(review): `conn.unwrap()` panics when the query omits `conn => ...`; since
// this is user input, returning an error would be safer than panicking.
// NOTE(review): TLS is unconditionally disabled via `sslmode` — confirm this is
// acceptable for remote servers.
let mysql_params = to_secret_map(HashMap::from([
("connection_string".to_string(), conn.unwrap()),
("sslmode".to_string(), "disabled".to_string()),
]));
let mysql_pool = Arc::new(MySQLConnectionPool::new(mysql_params).await?);
let table_factory = MySQLTableFactory::new(mysql_pool);
ctx.register_table(
table_name,
table_factory
.table_provider(TableReference::bare(table_path.clone()))
.await?,
)?;
Ok(())
}
5.2 使用示例
sql
-- Join a local Excel file with a MySQL table.
-- The first argument of read_mysql is the MySQL table name; `conn` carries the
-- connection string (placeholder credentials shown here).
SELECT t1.*, t2.email
FROM read_excel('/data/users.xlsx') AS t1
INNER JOIN read_mysql('accounts', conn => 'mysql://root:pass@localhost:3306/db') AS t2
ON t1.user_id = t2.id;
六、错误处理
统一的错误类型：通过实现 From，把各依赖库的错误自动转换为 AppError：
rust
/// Application-wide error type; errors from the underlying libraries are
/// converted into it through `From` implementations.
#[derive(Debug, Display, Error, Clone)]
pub enum AppError {
// Carries a human-readable message; the `From` conversions visible in this
// file all produce this variant.
BadRequest { message: String },
// NOTE(review): presumably raised when a referenced file does not exist — confirm.
FileNotFound { file_name: String },
// NOTE(review): presumably reserved for internal failures; no conversion in
// this excerpt produces it.
InternalServer { message: String },
}
impl From<DataFusionError> for AppError {
fn from(error: DataFusionError) -> Self {
BadRequest { message: error.to_string() }
}
}
impl From<XlsxError> for AppError {
fn from(error: XlsxError) -> Self {
BadRequest { message: error.to_string() }
}
}
impl From<rusqlite::Error> for AppError {
fn from(error: rusqlite::Error) -> Self {
BadRequest { message: error.to_string() }
}
}
// ... 还有 ParserError, ArrowError, mysql::Error, postgres::Error 等
七、性能优化点
- 预分配 Vec 容量：格式化查询结果前先统计总行数，一次性分配内存
- 预解析 SQL 类型枚举：避免在热循环中反复进行字符串分配和匹配
- 虚拟滚动：前端只渲染可视区域内的行，避免 DOM 节点数量爆炸
- 分页查询：默认 1000 行，fetch_page 支持按需加载
- SQLite 本地存储：查询历史持久化到本地磁盘，无需常驻内存
八、开源地址
- GitHub : github.com/shencangshe...
- License: MIT
- 平台: macOS, Windows
欢迎 Star、Issue 和 PR。