企业级大模型训练常见的数据集

企业级大模型训练数据集的特点:

  • 高质量:数据经过清洗和标注,准确无误
  • 大规模:数据量足够大,覆盖业务场景的多样性
  • 安全合规:数据不涉及用户隐私,符合法律法规
  • 领域相关:数据与业务领域紧密相关,例如金融、医疗、法律等

常见的数据格式包括:

  • 文本对:例如问答对、对话历史、平行语料(翻译)等
  • 序列标注:例如命名实体识别、词性标注等
  • 分类标签:例如情感分类、意图分类等
  • 非结构化文本:例如长文档、报告、文章等
JSON 示例:
// 单个JSON对象格式
{
  "id": "customer_001",
  "text": "产品使用体验很好,客服响应迅速",
  "label": "positive",
  "category": "feedback",
  "timestamp": "2024-01-15T10:30:00Z",
  "metadata": {
    "product_id": "prod_123",
    "user_tier": "gold",
    "source": "mobile_app"
  }
}

// JSONL格式(每行一个JSON对象)
{"id": "1", "text": "示例文本1", "label": "A"}
{"id": "2", "text": "示例文本2", "label": "B"}
{"id": "3", "text": "示例文本3", "label": "C"}
CSV / TSV 示例:
# CSV格式示例
id,text,sentiment,confidence,source,date
1,"产品质量不错,但物流太慢",neutral,0.67,website,2024-01-15
2,"客服态度恶劣,非常不满意",negative,0.92,call_center,2024-01-15
3,"功能强大,操作简单,推荐购买",positive,0.88,app_store,2024-01-14

# TSV格式示例(制表符分隔)
id	text	category	subcategory
1	"如何开通企业账户"	faq	account
2	"发票申请流程"	faq	billing
3	"API调用频率限制"	documentation	technical
XML 示例:
<!-- 医疗领域数据示例 -->
<patient_record>
    <patient_id>PT-001</patient_id>
    <encounters>
        <encounter>
            <date>2024-01-15</date>
            <type>outpatient</type>
            <symptoms>
                <symptom>fever</symptom>
                <symptom>cough</symptom>
                <symptom>fatigue</symptom>
            </symptoms>
            <diagnosis>influenza</diagnosis>
            <prescriptions>
                <medication name="Oseltamivir" dosage="75mg" frequency="twice daily"/>
            </prescriptions>
        </encounter>
    </encounters>
</patient_record>
JSON 示例:
//对话数据格式
{
  "conversation_id": "conv_001",
  "participants": ["customer", "agent"],
  "timestamp": "2024-01-15T10:30:00Z",
  "language": "zh-CN",
  "turns": [
    {
      "speaker": "customer",
      "utterance": "我的订单还没有发货,已经三天了",
      "timestamp": "10:30:00",
      "sentiment": "frustrated",
      "intent": "order_status_inquiry"
    },
    {
      "speaker": "agent",
      "utterance": "抱歉让您久等了,我马上为您查询订单状态",
      "timestamp": "10:31:15",
      "sentiment": "apologetic",
      "intent": "acknowledge_and_assist"
    }
  ],
  "summary": "客户查询订单发货状态,客服承诺跟进",
  "resolution": "pending",
  "metadata": {
    "customer_id": "cust_123",
    "order_id": "ord_456",
    "channel": "live_chat"
  }
}

企业级数据集构建流程

①、数据收集与清洗

Python 示例:
import pandas as pd
import json
import re
from typing import List,Dict,Any
from dataclasses import dataclass
import hashlib

@dataclass
class DataCleaningConfig:
    """Switches and thresholds that control text cleaning.

    All fields have defaults, so ``DataCleaningConfig()`` yields a usable
    configuration.
    """

    # Cleaned texts shorter than this many characters are discarded.
    min_text_length: int = 10
    # Cleaned texts longer than this many characters are discarded.
    max_text_length: int = 10000
    # Replace characters outside the allowed set with a space.
    remove_special_chars: bool = True
    # Replace e-mail addresses with the "[EMAIL]" placeholder.
    remove_emails: bool = True
    # Replace 10-11 digit numbers with the "[PHONE]" placeholder.
    remove_phone_numbers: bool = True
    # Drop rows whose cleaned text duplicates an earlier row.
    deduplicate: bool = True

class EnterpriseDataProcessor:
    """Clean, de-duplicate and export text datasets.

    Behaviour is driven by ``self.config``, which must expose the attributes
    ``min_text_length``, ``max_text_length``, ``remove_special_chars``,
    ``remove_emails``, ``remove_phone_numbers`` and ``deduplicate``.
    """

    # Explicit character classes instead of ``\S+`` so that CJK text written
    # directly next to an address (no spaces) is not swallowed by the match.
    _EMAIL_RE = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')
    # Digit look-arounds instead of ``\b``: CJK characters count as ``\w``, so
    # ``\b`` never matches between a Chinese character and a digit.
    _PHONE_RE = re.compile(r'(?<!\d)\d{10,11}(?!\d)')
    _SPECIAL_RE = re.compile(r'[^\w\s.,!?,。!?\u4e00-\u9fff]')
    _PLACEHOLDER_RE = re.compile(r'(\[EMAIL\]|\[PHONE\])')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, config: "DataCleaningConfig"):
        """Store the cleaning configuration.

        The annotation is a string so that defining this class does not
        require ``DataCleaningConfig`` to be resolvable at definition time.
        """
        self.config = config

    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of ``df``; the input frame is not modified.

        Steps: drop rows with a missing ``text``, add a ``cleaned_text``
        column, keep rows whose cleaned length lies within the configured
        bounds, optionally de-duplicate, and normalise string labels to
        lower case without surrounding whitespace. Frames without a ``text``
        column skip the text-related steps.
        """
        # Work on a copy so the caller's frame never gains extra columns.
        df = df.copy()

        if 'text' in df.columns:
            df = df.dropna(subset=['text'])
            df['cleaned_text'] = df['text'].apply(self._clean_text)

            lengths = df['cleaned_text'].map(len)
            in_range = (
                (lengths >= self.config.min_text_length)
                & (lengths <= self.config.max_text_length)
            )
            # ``.copy()`` avoids chained-assignment warnings further down.
            df = df.loc[in_range].copy()

        if self.config.deduplicate:
            df = self._deduplicate_data(df)

        if 'label' in df.columns:
            df = df.copy()
            # Only strings are normalised; other label values pass through.
            df['label'] = df['label'].map(
                lambda v: v.lower().strip() if isinstance(v, str) else v
            )

        return df

    def load_and_clean_csv(self, file_path: str) -> pd.DataFrame:
        """Read the CSV at ``file_path`` and return the cleaned frame."""
        return self._clean_dataframe(pd.read_csv(file_path))

    def _clean_text(self, text: Any) -> str:
        """Clean a single text value and return the result.

        Missing values (as detected by ``pd.isna``) become ``""``. E-mail
        addresses and phone numbers are masked *before* special characters
        are stripped, because stripping first removes ``@`` and makes the
        e-mail pattern unmatchable. The inserted placeholders are protected
        from the special-character pass.
        """
        if pd.isna(text):
            return ""

        text = str(text)

        if self.config.remove_emails:
            text = self._EMAIL_RE.sub('[EMAIL]', text)

        if self.config.remove_phone_numbers:
            text = self._PHONE_RE.sub('[PHONE]', text)

        if self.config.remove_special_chars:
            # Splitting on a capturing group puts placeholders at odd indices;
            # only the even-indexed (ordinary text) pieces are scrubbed.
            pieces = self._PLACEHOLDER_RE.split(text)
            text = ''.join(
                piece if index % 2 else self._SPECIAL_RE.sub(' ', piece)
                for index, piece in enumerate(pieces)
            )

        return self._WHITESPACE_RE.sub(' ', text).strip()

    def _deduplicate_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop rows whose ``cleaned_text`` repeats an earlier row.

        The first occurrence is kept. Frames without a ``cleaned_text``
        column are returned unchanged.
        """
        if 'cleaned_text' not in df.columns:
            return df
        # drop_duplicates hashes values itself, so no temporary hash column
        # (which would also mutate the caller's frame) is needed.
        return df.drop_duplicates(subset=['cleaned_text'])

    def convert_to_jsonl(self, df: pd.DataFrame, output_path: str):
        """Write ``df`` to ``output_path`` as UTF-8 JSON Lines.

        Each row becomes one JSON object on its own line; non-ASCII
        characters are written verbatim rather than escaped.
        """
        records = df.to_dict('records')

        with open(output_path, 'w', encoding='utf-8') as f:
            for record in records:
                f.write(json.dumps(record, ensure_ascii=False))
                f.write('\n')

②、数据标注框架

Python 示例:
class DataAnnotationFramework:
    """Annotation templates and basic structural validation per task type."""

    @staticmethod
    def create_annotation_template(task_type: str) -> Dict:
        """Return an example annotation record for ``task_type``.

        Known task types are ``sentiment_analysis``,
        ``intent_classification``, ``named_entity_recognition`` and
        ``text_classification``. Any other value yields the generic template
        ``{"text": "", "annotations": {}}``. A fresh dict is built on every
        call, so callers may modify the result freely.
        """
        templates = {
            "sentiment_analysis": {
                "text": "",
                "sentiment": {
                    "polarity": "positive/negative/neutral",
                    "intensity": 1.0,  # range 0-1
                    "aspects": [
                        {
                            "aspect": "product_quality",
                            "sentiment": "positive",
                            "confidence": 0.95
                        }
                    ]
                },
                "emotion": "happy/angry/sad/surprise",
                "confidence": 0.95
            },
            "intent_classification": {
                "text": "",
                "primary_intent": "order_inquiry",
                "secondary_intents": ["complaint", "information_request"],
                "confidence": 0.92,
                "slots": {
                    "order_number": "12345",
                    "product_name": "laptop"
                }
            },
            "named_entity_recognition": {
                "text": "",
                "entities": [
                    {
                        "text": "Microsoft",
                        "type": "ORG",
                        "start": 10,
                        "end": 19,
                        "confidence": 0.98
                    }
                ]
            },
            "text_classification": {
                "text": "",
                "categories": [
                    {"category": "technical_support", "confidence": 0.95},
                    {"category": "billing", "confidence": 0.85}
                ],
                "urgency": "high/medium/low",
                "channel": "email/chat/phone"
            }
        }
        return templates.get(task_type, {"text": "", "annotations": {}})

    @staticmethod
    def validate_annotations(data: Dict[str, Any], task_type: str) -> bool:
        """Check that ``data`` has the minimal structure for ``task_type``.

        Args:
            data: One annotation record.
            task_type: Task name selecting the validator.

        Returns:
            ``True`` when the record passes the task's check, or when no
            validator is registered for ``task_type`` (this includes
            ``text_classification``); ``False`` otherwise.
        """
        validators = {
            # ``sentiment`` must be a mapping; a plain string would otherwise
            # turn the ``in`` test into a substring search.
            "sentiment_analysis": lambda d: (
                isinstance(d.get('sentiment'), dict)
                and 'polarity' in d['sentiment']
            ),
            "intent_classification": lambda d: (
                'primary_intent' in d and isinstance(d['primary_intent'], str)
            ),
            "named_entity_recognition": lambda d: (
                'entities' in d and isinstance(d['entities'], list)
            ),
        }
        validator = validators.get(task_type)
        return validator(data) if validator else True

企业级场景数据集案例

一、金融风控场景

JSON 示例(# 开头的注释仅作说明,并非合法 JSON 语法):
# financial_risk_dataset.jsonl
{
  "scenario": "credit_risk_assessment",
  "data_type": "structured + unstructured",
  "sources": [
    "loan_applications",
    "credit_reports",
    "transaction_histories",
    "customer_interactions"
  ],
  "records": [
    {
      "application_id": "APP-2024-001",
      "customer_profile": {
        "age": 35,
        "income": 75000,
        "employment_status": "employed",
        "employment_duration_months": 48,
        "credit_score": 720,
        "debt_to_income_ratio": 0.35
      },
      "loan_details": {
        "amount": 25000,
        "term_months": 36,
        "purpose": "home_renovation",
        "collateral": "none"
      },
      "text_data": {
        "application_statement": "申请贷款用于房屋装修,计划在三个月内完成工程。",
        "customer_service_interaction": "客户对利率表示关注,询问提前还款政策。"
      },
      "historical_data": {
        "previous_loans": [
          {"amount": 10000, "status": "repaid", "delinquencies": 0},
          {"amount": 5000, "status": "repaid", "delinquencies": 1}
        ],
        "transaction_patterns": {
          "average_monthly_spending": 3500,
          "savings_rate": 0.25,
          "irregular_transactions": 3
        }
      },
      "risk_indicators": {
        "late_payments_last_year": 2,
        "credit_inquiries_last_6_months": 3,
        "utilization_ratio": 0.45
      },
      "annotations": {
        "risk_score": 0.35,  # 0-1, 越高风险越大
        "risk_category": "medium",
        "approval_recommendation": "approve_with_conditions",
        "recommended_conditions": ["higher_interest_rate", "shorter_term"],
        "manual_review_reason": "multiple_recent_inquiries",
        "expert_notes": "稳定的就业历史,但近期信用查询较多"
      },
      "outcome": {
        "approved": true,
        "actual_terms": {
          "interest_rate": 7.5,
          "approved_amount": 20000
        },
        "performance": {
          "repayment_status": "current",
          "days_past_due": 0,
          "early_repayment": false
        }
      },
      "metadata": {
        "collection_date": "2024-01-15",
        "data_source": "loan_system_v2",
        "sensitive_info_removed": true,
        "compliance_checked": true,
        "gdpr_compliant": true
      }
    }
  ]
}

# financial_conversations_dataset.jsonl
{
  "conversation_id": "FIN-CONV-001",
  "channel": "phone_call_transcript",
  "participants": ["customer", "financial_advisor"],
  "context": {
    "customer_segment": "high_net_worth",
    "topic": "investment_portfolio_review",
    "customer_mood": "concerned",
    "advisor_role": "senior_advisor"
  },
  "turns": [
    {
      "speaker": "customer",
      "utterance": "最近市场波动很大,我很担心我的投资组合",
      "sentiment": "anxious",
      "intent": "portfolio_concern",
      "financial_terms": ["市场波动", "投资组合"],
      "risk_tolerance_indicator": "low"
    },
    {
      "speaker": "financial_advisor",
      "utterance": "我理解您的担忧。让我们一起来看看您的资产配置,根据您的风险承受能力做适当调整",
      "sentiment": "reassuring",
      "intent": "provide_reassurance_and_solution",
      "advice_type": "portfolio_review",
      "compliance_check_passed": true
    }
  ],
  "annotations": {
    "financial_topics": ["market_volatility", "portfolio_management", "risk_assessment"],
    "customer_needs": ["reassurance", "expert_advice", "portfolio_optimization"],
    "advisor_effectiveness": 0.85,
    "compliance_violations": [],
    "recommended_actions": [
      "schedule_follow_up_meeting",
      "provide_quarterly_performance_report"
    ]
  }
}

二、医疗诊断辅助场景

JSON 示例(# 开头的注释仅作说明,并非合法 JSON 语法):
# medical_diagnosis_dataset.jsonl
{
  "dataset_type": "electronic_health_records",
  "data_schema_version": "2.0",
  "privacy_level": "de-identified",
  "records": [
    {
      "patient_id": "PT-001-ANON",
      "demographics": {
        "age_group": "35-44",
        "gender": "female",
        "region": "urban"
      },
      "visit_information": {
        "visit_id": "VISIT-001",
        "visit_date": "2024-01-15",
        "visit_type": "outpatient",
        "chief_complaint": "持续性头痛三天,伴有恶心"
      },
      "symptoms": {
        "subjective": [
          {
            "symptom": "headache",
            "description": "持续性钝痛,前额区域",
            "severity": 7,  # 1-10
            "duration_hours": 72,
            "onset": "gradual",
            "aggravating_factors": ["bright_lights", "noise"],
            "relieving_factors": ["rest", "dark_room"]
          },
          {
            "symptom": "nausea",
            "description": "轻度恶心,未呕吐",
            "severity": 4,
            "relation_to_meals": "unrelated"
          }
        ],
        "objective": {
          "vital_signs": {
            "blood_pressure": "130/85",
            "heart_rate": 88,
            "temperature": 37.2,
            "respiratory_rate": 16
          },
          "physical_exam": {
            "neurological": "normal",
            "fundoscopic_exam": "no_papilledema"
          }
        }
      },
      "medical_history": {
        "past_conditions": ["migraine", "hypertension"],
        "medications": ["propranolol 40mg daily"],
        "allergies": ["penicillin"],
        "family_history": {
          "migraine": "mother",
          "hypertension": "father"
        }
      },
      "diagnostic_tests": {
        "ordered": ["ct_head"],
        "results": [
          {
            "test": "ct_head",
            "result": "normal",
            "impression": "No acute intracranial abnormality"
          }
        ]
      },
      "differential_diagnoses": [
        {
          "condition": "migraine",
          "probability": 0.85,
          "supporting_evidence": [
            "history_of_migraine",
            "characteristic_headache_pattern",
            "photophobia_and_phonophobia"
          ],
          "contradicting_evidence": [
            "normal_neurological_exam",
            "no_aura_present"
          ]
        },
        {
          "condition": "tension_headache",
          "probability": 0.10,
          "supporting_evidence": ["bilateral_pain", "no_nausea"]
        },
        {
          "condition": "sinusitis",
          "probability": 0.05,
          "supporting_evidence": ["facial_pain"]
        }
      ],
      "final_diagnosis": {
        "primary": "migraine_without_aura",
        "icd10_code": "G43.009",
        "confidence": 0.90
      },
      "treatment_plan": {
        "acute_management": [
          {
            "medication": "sumatriptan",
            "dose": "50mg",
            "instructions": "take at onset of headache, max 2 doses per day"
          }
        ],
        "preventive_measures": [
          "maintain_regular_sleep_schedule",
          "identify_and_avoid_triggers",
          "stress_management_techniques"
        ],
        "follow_up": "return_if_symptoms_worsen_or_change"
      },
      "doctor_notes": {
        "assessment": "患者表现为典型偏头痛特征,无警示症状",
        "plan": "给予曲坦类药物治疗急性发作,建议预防措施",
        "patient_education": "解释了偏头痛的触发因素和管理策略"
      },
      "annotations_for_ai": {
        "symptom_entities": [
          {"text": "头痛", "type": "SYMPTOM", "normalized": "headache"},
          {"text": "恶心", "type": "SYMPTOM", "normalized": "nausea"}
        ],
        "diagnosis_certainty": "high",
        "treatment_appropriateness": "appropriate",
        "risk_factors": ["family_history", "stress"],
        "red_flags_absent": true
      }
    }
  ]
}

# medical_qa_dataset.jsonl
{
  "question_id": "MED-QA-001",
  "question": "偏头痛和紧张性头痛有什么区别?",
  "question_source": "patient_portal",
  "question_context": "患者有间歇性头痛,想了解不同类型头痛的特征",
  "expert_answer": {
    "text": "偏头痛通常表现为单侧搏动性中重度头痛,持续4-72小时,常伴有恶心、呕吐、畏光、畏声。紧张性头痛则多为双侧压迫性或紧束性轻度中度头痛,不伴有恶心呕吐,日常活动不加重头痛。",
    "components": {
      "migraine_characteristics": [
        "unilateral_throbbing_pain",
        "moderate_to_severe_intensity",
        "duration_4_72_hours",
        "accompanied_by_nausea_vomiting",
        "photophobia_phonophobia"
      ],
      "tension_headache_characteristics": [
        "bilateral_pressure_pain",
        "mild_to_moderate_intensity",
        "no_nausea_vomiting",
        "not_aggravated_by_routine_activity"
      ],
      "differential_points": [
        "pain_location",
        "pain_quality",
        "associated_symptoms",
        "impact_on_activity"
      ]
    }
  },
  "simplified_answer_for_patient": "偏头痛通常是头的一侧搏动性剧痛,可能伴随恶心怕光;紧张性头痛则是头两侧的压迫感,一般不恶心。",
  "metadata": {
    "medical_specialty": "neurology",
    "difficulty_level": "intermediate",
    "verification_status": "verified_by_neurologist",
    "references": ["ICHD-3 criteria", "uptodate.com"]
  }
}

三、智能客服场景

JSON 示例(# 开头的注释仅作说明,并非合法 JSON 语法):
# customer_service_dataset.jsonl
{
  "dataset_name": "multilingual_customer_service",
  "version": "3.0",
  "description": "多语言客户服务对话数据集",
  "records": [
    {
      "interaction_id": "CS-INT-001",
      "language": "zh-CN",
      "channel": "live_chat",
      "business_domain": "ecommerce",
      "customer_profile": {
        "tier": "gold_member",
        "tenure_months": 24,
        "lifetime_value": 15000,
        "recent_purchases": 5
      },
      "conversation": {
        "turns": [
          {
            "turn_id": 1,
            "speaker": "customer",
            "message": "我上周买的手机屏幕有划痕,想要退货",
            "timestamp": "2024-01-15T10:30:00Z",
            "sentiment": "frustrated",
            "intent": "return_request",
            "entities": [
              {"text": "手机", "type": "PRODUCT", "confidence": 0.98},
              {"text": "上周", "type": "TIME", "normalized": "7_days_ago"},
              {"text": "屏幕划痕", "type": "ISSUE", "confidence": 0.95}
            ]
          },
          {
            "turn_id": 2,
            "speaker": "agent",
            "message": "非常抱歉给您带来不便。请提供订单号,我为您处理退货申请",
            "timestamp": "2024-01-15T10:31:15Z",
            "sentiment": "empathetic",
            "intent": "request_information",
            "action": "acknowledge_issue_and_request_order_id"
          },
          {
            "turn_id": 3,
            "speaker": "customer",
            "message": "订单号是ORD-123456",
            "timestamp": "2024-01-15T10:32:00Z",
            "intent": "provide_information",
            "entities": [
              {"text": "ORD-123456", "type": "ORDER_ID", "confidence": 0.99}
            ]
          },
          {
            "turn_id": 4,
            "speaker": "agent",
            "message": "已收到。根据您的订单,商品在7天无理由退货期内。我将为您生成退货标签,快递员会在明天上门取件。退款将在商品质检后3-5个工作日内处理。",
            "timestamp": "2024-01-15T10:33:30Z",
            "sentiment": "helpful",
            "intent": "provide_solution",
            "actions": [
              "check_return_policy",
              "generate_return_label",
              "schedule_pickup",
              "explain_refund_timeline"
            ]
          }
        ],
        "summary": "客户因产品质量问题要求退货,客服处理了退货申请并安排取件",
        "resolution": "return_approved",
        "resolution_time_minutes": 3.5,
        "customer_satisfaction_score": 4  # 1-5
      },
      "business_rules_applied": [
        "7_day_return_policy",
        "free_return_shipping_for_defects",
        "refund_after_quality_check"
      ],
      "escalation_info": {
        "escalated": false,
        "escalation_reason": null,
        "escalation_level": null
      },
      "annotations_for_training": {
        "intent_hierarchy": [
          {"primary": "return_request", "secondary": ["defective_product"]}
        ],
        "emotion_trajectory": ["frustrated", "calm", "satisfied"],
        "agent_skill_used": ["policy_knowledge", "empathetic_response", "process_execution"],
        "conversation_quality_score": 0.88,
        "key_phrases": [
          "屏幕有划痕",
          "提供订单号",
          "退货申请",
          "上门取件",
          "退款处理"
        ]
      },
      "metadata": {
        "collection_method": "real_production_data",
        "privacy_compliance": "gdpr_compliant",
        "personally_identifiable_information_removed": true,
        "annotated_by": ["senior_agent", "quality_team"],
        "annotation_agreement": 0.92
      }
    }
  ]
}

# customer_feedback_dataset.jsonl
{
  "feedback_id": "FB-001",
  "source": "product_review",
  "platform": "mobile_app_store",
  "language": "zh-CN",
  "product_info": {
    "product_id": "PROD-123",
    "product_name": "智能手表X200",
    "category": "wearables",
    "purchase_date": "2024-01-01"
  },
  "feedback_text": "电池续航非常好,能用5天不充电。但是表带设计不太舒适,戴久了手腕会疼。心率监测准确,睡眠分析功能很有用。",
  "structured_feedback": {
    "aspects": [
      {
        "aspect": "battery_life",
        "sentiment": "positive",
        "intensity": 0.9,
        "mention": "电池续航非常好,能用5天不充电"
      },
      {
        "aspect": "comfort",
        "sentiment": "negative",
        "intensity": 0.7,
        "mention": "表带设计不太舒适,戴久了手腕会疼"
      },
      {
        "aspect": "health_features",
        "sentiment": "positive",
        "intensity": 0.8,
        "mention": "心率监测准确,睡眠分析功能很有用"
      }
    ],
    "overall_sentiment": "mixed",
    "overall_rating": 4,  # 1-5
    "recommendation": true
  },
  "customer_profile": {
    "segment": "tech_enthusiast",
    "usage_frequency": "daily",
    "primary_use_case": "fitness_tracking"
  },
  "annotations": {
    "feature_requests": ["improved_band_comfort"],
    "bug_reports": [],
    "usability_issues": ["band_discomfort"],
    "competitive_advantages": ["battery_life", "health_monitoring"],
    "priority_level": "medium",
    "assigned_department": "product_design"
  },
  "business_impact": {
    "potential_churn_risk": "low",
    "upsell_opportunity": "medium",
    "product_improvement_priority": "high"
  }
}

四、法律文档分析场景

JSON 示例(# 开头的注释仅作说明,并非合法 JSON 语法):
# legal_contract_dataset.jsonl
{
  "document_id": "CONTRACT-001",
  "document_type": "software_license_agreement",
  "industry": "technology",
  "jurisdiction": "California, USA",
  "language": "en",
  "document_text": "This Software License Agreement (the 'Agreement') is entered into on January 15, 2024 (the 'Effective Date') between ABC Technologies, Inc. ('Licensor') and XYZ Corporation ('Licensee').\n\n1. License Grant. Subject to the terms of this Agreement, Licensor grants Licensee a non-exclusive, non-transferable license to use the Software...\n\n2. Term. This Agreement shall commence on the Effective Date and continue for a period of three (3) years...\n\n3. Fees. Licensee shall pay Licensor an annual license fee of $50,000...",
  "structured_analysis": {
    "parties": [
      {
        "name": "ABC Technologies, Inc.",
        "role": "Licensor",
        "type": "corporation"
      },
      {
        "name": "XYZ Corporation",
        "role": "Licensee",
        "type": "corporation"
      }
    ],
    "key_clauses": [
      {
        "clause_number": "1",
        "clause_title": "License Grant",
        "clause_text": "Subject to the terms of this Agreement, Licensor grants Licensee a non-exclusive, non-transferable license to use the Software...",
        "clause_type": "grant",
        "risk_level": "low",
        "standardization_score": 0.85
      },
      {
        "clause_number": "3",
        "clause_title": "Fees",
        "clause_text": "Licensee shall pay Licensor an annual license fee of $50,000...",
        "clause_type": "financial",
        "risk_level": "medium",
        "financial_terms": [
          {
            "term": "annual_license_fee",
            "amount": 50000,
            "currency": "USD",
            "payment_frequency": "annual"
          }
        ]
      }
    ],
    "risk_assessment": {
      "overall_risk_score": 0.35,  # 0-1
      "high_risk_clauses": [],
      "medium_risk_clauses": ["3"],
      "low_risk_clauses": ["1", "2"],
      "missing_clauses": ["indemnification", "limitation_of_liability"],
      "unusual_provisions": []
    },
    "compliance_check": {
      "gdpr_compliance": "partial",
      "data_protection_clauses": "insufficient",
      "termination_rights": "balanced",
      "jurisdiction_appropriateness": "appropriate"
    }
  },
  "annotations": {
    "legal_concepts": [
      {"concept": "license_grant", "mentions": 3},
      {"concept": "term_duration", "mentions": 2},
      {"concept": "payment_obligation", "mentions": 1}
    ],
    "obligations": [
      {
        "obligor": "Licensee",
        "obligation": "pay_license_fee",
        "condition": "annually",
        "amount": "$50,000"
      },
      {
        "obligor": "Licensor",
        "obligation": "grant_software_license",
        "condition": "non-exclusive, non-transferable"
      }
    ],
    "rights": [
      {
        "right_holder": "Licensee",
        "right": "use_software",
        "limitations": ["non-exclusive", "non-transferable"]
      }
    ],
    "dates": [
      {"date_type": "effective_date", "value": "2024-01-15"},
      {"date_type": "agreement_term", "value": "3_years"}
    ]
  },
  "expert_review": {
    "reviewer": "senior_counsel",
    "review_date": "2024-01-16",
    "recommendations": [
      "Add indemnification clause",
      "Include limitation of liability provisions",
      "Specify dispute resolution mechanism"
    ],
    "negotiation_points": [
      "Negotiate fee structure",
      "Request broader license rights",
      "Include service level agreements"
    ]
  }
}

五、人力资源招聘场景

JSON 示例(# 开头的注释仅作说明,并非合法 JSON 语法):
# hr_recruitment_dataset.jsonl
{
  "dataset_type": "resume_and_job_matching",
  "records": [
    {
      "resume_id": "RESUME-001",
      "candidate_info": {
        "candidate_id": "CAND-001",
        "years_experience": 5,
        "education_level": "master",
        "current_location": "San Francisco, CA",
        "willing_to_relocate": true,
        "salary_expectation": 150000,
        "notice_period_days": 30
      },
      "resume_text": "Senior Data Scientist with 5 years of experience in machine learning and big data technologies. Led multiple AI projects from conception to deployment...",
      "structured_resume": {
        "contact_info": {
          "name": "Jane Doe",
          "email": "jane.doe@example.com",
          "phone": "+1-555-0123"
        },
        "work_experience": [
          {
            "company": "TechCorp Inc.",
            "position": "Senior Data Scientist",
            "duration": "2020-Present",
            "responsibilities": [
              "Led development of recommendation systems improving CTR by 25%",
              "Managed team of 3 junior data scientists",
              "Implemented ML pipelines processing 1TB+ data daily"
            ],
            "technologies": ["Python", "TensorFlow", "Spark", "AWS"]
          }
        ],
        "education": [
          {
            "institution": "Stanford University",
            "degree": "M.S. Computer Science",
            "graduation_year": 2019,
            "gpa": 3.8
          }
        ],
        "skills": {
          "technical": [
            {"skill": "Machine Learning", "level": "expert", "years": 5},
            {"skill": "Python", "level": "expert", "years": 7},
            {"skill": "Data Visualization", "level": "advanced", "years": 4}
          ],
          "soft_skills": ["leadership", "communication", "problem_solving"]
        },
        "certifications": ["AWS Certified Machine Learning Specialty"],
        "projects": [
          {
            "name": "Customer Churn Prediction",
            "description": "Built ML model to predict customer churn with 85% accuracy",
            "impact": "Reduced churn by 15%"
          }
        ]
      },
      "job_application": {
        "job_id": "JOB-001",
        "job_title": "Lead Machine Learning Engineer",
        "company": "AI Innovations Ltd.",
        "job_description": "We are seeking an experienced Machine Learning Engineer to lead our AI initiatives...",
        "requirements": {
          "required_skills": ["Machine Learning", "Python", "Cloud Platforms"],
          "required_experience_years": 5,
          "required_education": "Master's degree",
          "preferred_skills": ["Leadership", "TensorFlow", "Big Data"]
        },
        "compensation": {
          "salary_range_min": 140000,
          "salary_range_max": 180000,
          "equity": true,
          "benefits": ["health_insurance", "401k_matching", "remote_work"]
        }
      },
      "matching_analysis": {
        "skills_match_score": 0.92,
        "experience_match_score": 0.95,
        "education_match_score": 1.0,
        "culture_fit_score": 0.85,
        "compensation_alignment": 0.90,
        "overall_match_percentage": 0.91,
        "strengths": [
          "Strong technical background",
          "Relevant industry experience",
          "Leadership experience"
        ],
        "gaps": [
          "Limited experience with specific cloud platform",
          "No experience in current company's industry"
        ],
        "recommendation": "strong_candidate",
        "interview_priority": "high"
      },
      "recruitment_process": {
        "screening_status": "passed",
        "interview_stages": [
          {
            "stage": "phone_screen",
            "date": "2024-01-10",
            "result": "passed",
            "feedback": "Excellent communication skills and deep technical knowledge"
          },
          {
            "stage": "technical_interview",
            "date": "2024-01-15",
            "result": "passed",
            "technical_assessment_score": 88
          }
        ],
        "hiring_decision": "pending",
        "expected_timeline": "2_weeks"
      },
      "diversity_inclusion_metrics": {
        "gender": "female",
        "ethnicity": "asian",
        "age_group": "30-39",
        "veteran_status": "no",
        "disability_status": "no"
      }
    }
  ]
}

数据集质量评估与验证

数据集质量检查

Python 示例:
class DatasetQualityChecker:
    """Run quality checks over a dataset and assemble a report.

    NOTE(review): this class calls ``load_data``, ``detect_language``,
    ``check_basic_statistics``, ``check_duplicates``,
    ``check_missing_values``, ``get_file_size``,
    ``calculate_text_quality_score`` and ``calculate_balance_score``, none of
    which are defined in the visible part of the file. They are presumably
    provided elsewhere (further down, a mixin or a subclass) - confirm.
    ``load_data`` is assumed to return a sized sequence of dict records.
    """

    def __init__(self, dataset_path: str):
        """Remember the dataset location; no I/O happens here."""
        self.dataset_path = dataset_path

    def run_quality_checks(self) -> Dict[str, Any]:
        """Run every check and return the combined quality report."""
        checks = {
            "basic_statistics": self.check_basic_statistics(),
            "text_quality": self.check_text_quality(),
            "label_distribution": self.check_label_distribution(),
            "duplicates": self.check_duplicates(),
            "missing_values": self.check_missing_values(),
            "annotation_consistency": self.check_annotation_consistency()
        }
        return self.generate_quality_report(checks)

    def check_text_quality(self) -> Dict:
        """Compute length and language statistics over each record's ``text``.

        ``avg_words_per_sentence``, ``readability_scores`` and
        ``special_char_ratio`` are returned with their initial placeholder
        values; this method does not compute them. For an empty dataset all
        length metrics are ``0``.
        """
        metrics = {
            "avg_length": 0,
            "min_length": float('inf'),
            "max_length": 0,
            "avg_words_per_sentence": 0,
            "readability_scores": {},
            "language_distribution": {},
            "special_char_ratio": 0
        }

        data = self.load_data()
        if not data:
            # Avoid dividing by zero and reporting an infinite minimum.
            metrics["min_length"] = 0
            return metrics

        total_length = 0
        language_counts = metrics["language_distribution"]
        for item in data:
            # A missing or null ``text`` is treated as the empty string.
            text = item.get('text') or ''
            length = len(text)
            total_length += length
            metrics["min_length"] = min(metrics["min_length"], length)
            metrics["max_length"] = max(metrics["max_length"], length)

            lang = self.detect_language(text)
            language_counts[lang] = language_counts.get(lang, 0) + 1

        metrics["avg_length"] = total_length / len(data)
        return metrics

    def check_label_distribution(self) -> Dict:
        """Count labels and measure class imbalance.

        Records whose ``label`` is missing or ``None`` are ignored; falsy but
        valid labels such as ``0`` are counted. ``imbalance_ratio`` is the
        largest class count divided by the smallest, or ``0.0`` when no
        record carries a label.
        """
        data = self.load_data()
        label_counts = {}

        for item in data:
            label = item.get('label')
            if label is not None:
                label_counts[label] = label_counts.get(label, 0) + 1

        total = sum(label_counts.values())
        if label_counts:
            counts = label_counts.values()
            imbalance_ratio = max(counts) / min(counts)
        else:
            imbalance_ratio = 0.0

        return {
            "label_counts": label_counts,
            "total_samples": total,
            "num_classes": len(label_counts),
            "imbalance_ratio": imbalance_ratio,
            "class_distribution": {k: v / total for k, v in label_counts.items()}
        }

    def check_annotation_consistency(self) -> Dict:
        """Check required fields and the type of ``annotations`` per record.

        Every record must contain ``id`` and ``text``; when ``annotations``
        is present it must be a dict. ``consistency_score`` is
        ``1 - issues / records`` clamped to ``[0, 1]`` and is ``1.0`` for an
        empty dataset. At most the first ten issues are returned.
        """
        data = self.load_data()
        required_fields = ('id', 'text')

        inconsistencies = []
        for item in data:
            item_id = item.get('id', 'unknown')
            for field in required_fields:
                if field not in item:
                    inconsistencies.append(f"Missing {field} in item {item_id}")
            if 'annotations' in item and not isinstance(item['annotations'], dict):
                inconsistencies.append(f"Invalid annotations format in {item_id}")

        if data:
            # One record can raise several issues, so clamp at zero.
            consistency_score = max(0.0, 1 - len(inconsistencies) / len(data))
        else:
            consistency_score = 1.0

        return {
            "total_items": len(data),
            "inconsistencies_found": len(inconsistencies),
            "consistency_score": consistency_score,
            "issues": inconsistencies[:10]  # only the first 10 issues are shown
        }

    def generate_quality_report(self, checks: Dict) -> Dict:
        """Combine the individual check results into one report dict.

        Args:
            checks: Mapping with the keys ``basic_statistics``,
                ``text_quality``, ``label_distribution``, ``duplicates``,
                ``missing_values`` and ``annotation_consistency``.

        Returns:
            A dict with ``dataset_info``, ``quality_metrics``, ``issues``
            and ``recommendations``.
        """
        basic_stats = checks["basic_statistics"]
        # A reported total of 0 fields would otherwise divide by zero.
        total_fields = basic_stats.get("total_fields", 1) or 1
        missing_count = checks["missing_values"].get("missing_count", 0)

        report = {
            "dataset_info": {
                "path": self.dataset_path,
                "size_gb": self.get_file_size(),
                "num_records": basic_stats.get("num_records", 0)
            },
            "quality_metrics": {
                "text_quality_score": self.calculate_text_quality_score(checks["text_quality"]),
                "label_balance_score": self.calculate_balance_score(checks["label_distribution"]),
                "duplication_rate": checks["duplicates"].get("duplication_rate", 0),
                "completeness_score": 1 - (missing_count / total_fields),
                "consistency_score": checks["annotation_consistency"].get("consistency_score", 1)
            },
            "issues": {
                "critical": [],
                "warning": [],
                "info": []
            },
            "recommendations": []
        }

        # Derive warnings and recommendations from the check results.
        if checks["label_distribution"]["imbalance_ratio"] > 10:
            report["issues"]["warning"].append("严重的类别不平衡")
            report["recommendations"].append("考虑对少数类进行过采样或对多数类进行欠采样")

        if checks["text_quality"]["avg_length"] < 20:
            report["issues"]["warning"].append("文本过短")
            report["recommendations"].append("考虑过滤或合并过短的文本")

        return report