构建一个简单的亿级数据迁移方案案例,结合Spring AI框架实现这个。
场景案例:电商平台用户行为数据迁移与AI分析
假设我们需要将一个电商平台的10亿条用户行为数据从传统MySQL迁移到分布式存储系统,并结合Spring AI进行实时推荐分析。
1. 项目结构设计
text
data-migration-ai/
├── src/main/java/
│ ├── config/
│ │ ├── DataSourceConfig.java
│ │ ├── SpringAIConfig.java
│ │ └── BatchConfig.java
│ ├── entity/
│ │ ├── UserBehavior.java
│ │ └── UserBehaviorTarget.java
│ ├── repository/
│ │ ├── SourceRepository.java
│ │ └── TargetRepository.java
│ ├── service/
│ │ ├── MigrationService.java
│ │ └── AIAnalysisService.java
│ └── batch/
│ ├── UserBehaviorProcessor.java
│ └── UserBehaviorWriter.java
└── pom.xml
2. Maven依赖配置
xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Fix: a Maven POM's root element is <project> with the 4.0.0 model;
     "<pom.xml>" is not valid POM XML and will not build. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.example</groupId>
    <artifactId>data-migration-ai</artifactId>
    <version>1.0.0-SNAPSHOT</version>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>3.2.0</version>
    </parent>

    <dependencies>
        <!-- Spring Boot starters (versions managed by the parent BOM) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-batch</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <!-- Added: the REST controller needs the web starter, and
             MongoTemplate needs the data-mongodb starter (the bare driver
             below does not provide it). -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-mongodb</artifactId>
        </dependency>
        <!-- Spring AI -->
        <dependency>
            <groupId>org.springframework.ai</groupId>
            <artifactId>spring-ai-openai-spring-boot-starter</artifactId>
            <version>0.8.1</version>
        </dependency>
        <!-- NOTE(review): verify this artifact id against the Spring AI 0.8.1
             release; embedding support normally ships inside the model
             starter rather than a separate "embedding-store" starter. -->
        <dependency>
            <groupId>org.springframework.ai</groupId>
            <artifactId>spring-ai-embedding-store-spring-boot-starter</artifactId>
            <version>0.8.1</version>
        </dependency>
        <!-- Database drivers. Fix: Boot 3.x manages the MySQL driver under
             com.mysql:mysql-connector-j; the legacy mysql:mysql-connector-java
             coordinates have no managed version in this BOM. -->
        <dependency>
            <groupId>com.mysql</groupId>
            <artifactId>mysql-connector-j</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongodb-driver-sync</artifactId>
        </dependency>
        <!-- Distributed storage -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.3.4</version>
        </dependency>
        <!-- Cache -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>
        <!-- Monitoring -->
        <dependency>
            <groupId>io.micrometer</groupId>
            <artifactId>micrometer-registry-prometheus</artifactId>
        </dependency>
    </dependencies>
</project>
3. 数据源配置
java
/**
 * Multi-datasource wiring: two MySQL Hikari pools (read side / write side),
 * a MongoTemplate for the migrated documents and a Redis template for the
 * per-user hot cache.
 */
@Configuration
@EnableConfigurationProperties
public class DataSourceConfig {

    /**
     * Read-side MySQL pool.
     *
     * <p>Fix: connection coordinates (jdbc-url, username, password) are bound
     * from {@code spring.datasource.source.*} via {@code @ConfigurationProperties}
     * instead of being hard-coded — the original embedded credentials in
     * source code (a security defect) while simultaneously declaring the
     * property binding, so the two fought over the same settings.
     */
    @Primary
    @Bean(name = "sourceDataSource")
    @ConfigurationProperties(prefix = "spring.datasource.source")
    public DataSource sourceDataSource() {
        HikariConfig config = new HikariConfig();
        // Pool sizing only; credentials come from externalized configuration.
        config.setMaximumPoolSize(50);
        config.setMinimumIdle(10);
        config.setConnectionTimeout(30000);
        config.setIdleTimeout(600000);
        config.setMaxLifetime(1800000);
        return new HikariDataSource(config);
    }

    /** Write-side MySQL pool, sized larger for the bulk-insert workload. */
    @Bean(name = "targetDataSource")
    @ConfigurationProperties(prefix = "spring.datasource.target")
    public DataSource targetDataSource() {
        HikariConfig config = new HikariConfig();
        config.setMaximumPoolSize(100);
        config.setMinimumIdle(20);
        return new HikariDataSource(config);
    }

    /** MongoTemplate bound to the {@code user_behavior} database. */
    @Bean
    public MongoTemplate mongoTemplate() {
        return new MongoTemplate(MongoClients.create(
                "mongodb://mongo-cluster:27017"), "user_behavior");
    }

    /** Redis template with String keys and JSON-serialized values. */
    @Bean
    public RedisTemplate<String, Object> redisTemplate() {
        RedisTemplate<String, Object> template = new RedisTemplate<>();
        template.setConnectionFactory(jedisConnectionFactory());
        template.setKeySerializer(new StringRedisSerializer());
        template.setValueSerializer(new GenericJackson2JsonRedisSerializer());
        return template;
    }

    /**
     * Jedis connection factory.
     *
     * <p>Fix: {@code setHostName}/{@code setPort}/{@code setUsePool} are
     * deprecated in current Spring Data Redis — standalone coordinates belong
     * in a {@code RedisStandaloneConfiguration} passed to the constructor.
     */
    @Bean
    public JedisConnectionFactory jedisConnectionFactory() {
        RedisStandaloneConfiguration redisConfig =
                new RedisStandaloneConfiguration("redis-cluster", 6379);
        return new JedisConnectionFactory(redisConfig);
    }
}
4. Spring AI配置
java
/**
 * Spring AI wiring: chat and embedding clients backed by the OpenAI API,
 * an in-memory vector store, and the recommendation prompt template.
 */
@Configuration
@EnableConfigurationProperties(OpenAiProperties.class)
public class SpringAIConfig {

    /**
     * Chat client used to generate user profiles and recommendations.
     *
     * <p>Fix: Spring AI 0.8.x has no {@code OpenAiApi.builder()} — the API
     * client is created through its {@code (baseUrl, apiKey)} constructor.
     */
    @Bean
    public OpenAiChatClient openAiChatClient(OpenAiProperties properties) {
        return new OpenAiChatClient(
                new OpenAiApi(properties.getBaseUrl(), properties.getApiKey()));
    }

    /** Embedding client used to vectorize behavior descriptions. */
    @Bean
    public OpenAiEmbeddingClient openAiEmbeddingClient(OpenAiProperties properties) {
        return new OpenAiEmbeddingClient(
                new OpenAiApi(properties.getBaseUrl(), properties.getApiKey()));
    }

    /**
     * In-memory vector store; fine for a demo, swap for a persistent store
     * (pgvector, Redis, …) in production.
     */
    @Bean
    public VectorStore vectorStore(OpenAiEmbeddingClient embeddingClient) {
        return new SimpleVectorStore(embeddingClient);
    }

    /**
     * Prompt asking the model for five product recommendations; the template
     * variables (userId, categories, purchaseHistory, behaviorPattern) are
     * filled in by AIAnalysisService.
     */
    @Bean
    public PromptTemplate recommendationPromptTemplate() {
        String template = """
                基于用户行为数据,为用户ID {userId} 生成个性化推荐:
                用户最近浏览的商品类别:{categories}
                用户购买历史:{purchaseHistory}
                用户行为模式:{behaviorPattern}
                请生成5个最相关的商品推荐,包括推荐理由。
                """;
        return new PromptTemplate(template);
    }
}
5. 实体类定义
java
/**
 * Source-side JPA entity mapped to the MySQL {@code user_behavior} table —
 * one row per recorded user action on the e-commerce platform.
 */
@Entity
@Table(name = "user_behavior")
public class UserBehavior {
    // Auto-increment primary key.
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    // Acting user.
    @Column(name = "user_id")
    private Long userId;
    // Product the action targeted.
    @Column(name = "product_id")
    private Long productId;
    // Kind of action; stored as its enum name (EnumType.STRING).
    @Column(name = "behavior_type")
    @Enumerated(EnumType.STRING)
    private BehaviorType behaviorType;
    // Category of the product involved.
    @Column(name = "category_id")
    private Long categoryId;
    // Browser/app session the action occurred in.
    @Column(name = "session_id")
    private String sessionId;
    // When the action happened; JSON-rendered as "yyyy-MM-dd HH:mm:ss".
    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")
    @Column(name = "timestamp")
    private LocalDateTime timestamp;
    // Dwell time — units not stated here; presumably seconds, TODO confirm.
    @Column(name = "duration")
    private Integer duration;
    // Client device type (e.g. mobile/desktop — values set by the tracker).
    @Column(name = "device_type")
    private String deviceType;
    // Client IP at the time of the action.
    @Column(name = "ip_address")
    private String ipAddress;
    // Constructors, getters and setters omitted for brevity.
    /** Kinds of tracked user actions; persisted as strings via @Enumerated. */
    public enum BehaviorType {
        VIEW, CLICK, ADD_TO_CART, PURCHASE, FAVORITE, SHARE, SEARCH
    }
}
/**
 * Target-side MongoDB document ({@code user_behavior_analytics} collection):
 * the migrated behavior record plus AI-derived enrichment fields.
 */
@Document(collection = "user_behavior_analytics")
public class UserBehaviorTarget {
    // MongoDB ObjectId as String (source Long id is not carried over).
    @Id
    private String id;
    private Long userId;
    private Long productId;
    // Enum name of UserBehavior.BehaviorType, stored as plain text.
    private String behaviorType;
    private Long categoryId;
    private String sessionId;
    private LocalDateTime timestamp;
    private Integer duration;
    private String deviceType;
    private String ipAddress;
    // AI analysis result fields (populated by AIAnalysisService).
    // Embedding-derived relevance score; defaulted to 0.5 when analysis fails.
    private Double relevanceScore;
    private List<String> recommendedProducts;
    // Short LLM-generated profile text.
    private String userProfile;
    // Full insights map (score, profile, recommendations, vector, timestamp).
    private Map<String, Object> behaviorInsights;
    // Constructors, getters and setters omitted for brevity.
}
6. 核心迁移服务
java
/**
 * Orchestrates the migration: launches the Spring Batch job and offers an
 * async chunk-processing path (convert -> AI-enrich -> Mongo insert -> Redis
 * cache) with Micrometer instrumentation.
 *
 * <p>Fix: the class-level {@code @Transactional} was removed — launching a
 * batch job inside an active transaction is rejected by Spring Batch's
 * JobRepository ("existing transaction detected").
 */
@Service
@Slf4j
public class MigrationService {

    @Autowired
    @Qualifier("sourceDataSource")
    private DataSource sourceDataSource;
    @Autowired
    private MongoTemplate mongoTemplate;
    @Autowired
    private RedisTemplate<String, Object> redisTemplate;
    @Autowired
    private AIAnalysisService aiAnalysisService;
    @Autowired
    private JobLauncher jobLauncher;
    @Autowired
    private Job migrationJob;

    /** Count of records successfully migrated. */
    private final Counter migrationCounter = Counter.builder("migration.records")
            .description("Number of migrated records")
            .register(Metrics.globalRegistry);
    /** Wall-clock duration of each full migration run. */
    private final Timer migrationTimer = Timer.builder("migration.duration")
            .description("Migration duration")
            .register(Metrics.globalRegistry);

    /**
     * Launches the batch job. Unique parameters (timestamp + UUID) give each
     * run its own JobInstance so re-runs are never treated as restarts.
     *
     * @throws RuntimeException wrapping any launch failure
     */
    public void startMigration() {
        Timer.Sample sample = Timer.start(Metrics.globalRegistry);
        try {
            log.info("开始亿级数据迁移任务...");
            JobParameters jobParameters = new JobParametersBuilder()
                    .addLong("startTime", System.currentTimeMillis())
                    .addString("migrationId", UUID.randomUUID().toString())
                    .toJobParameters();
            JobExecution jobExecution = jobLauncher.run(migrationJob, jobParameters);
            log.info("迁移任务状态: {}", jobExecution.getStatus());
        } catch (Exception e) {
            log.error("迁移任务执行失败", e);
            throw new RuntimeException("Migration failed", e);
        } finally {
            sample.stop(migrationTimer);
        }
    }

    /**
     * Processes one chunk of source rows asynchronously.
     *
     * <p>Fix: the original wrapped the body in
     * {@code CompletableFuture.runAsync(...)} inside an {@code @Async} method,
     * double-dispatching the work onto {@code ForkJoinPool.commonPool()} and
     * bypassing the executor configured for {@code @Async}. The work now runs
     * directly on the async executor and a completed future is returned.
     *
     * @param behaviors source rows to convert, enrich and persist
     * @return completed future once the chunk is written
     */
    @Async
    public CompletableFuture<Void> processChunk(List<UserBehavior> behaviors) {
        List<UserBehaviorTarget> targets = new ArrayList<>();
        for (UserBehavior behavior : behaviors) {
            try {
                // Field-level conversion to the Mongo document.
                UserBehaviorTarget target = convertToTarget(behavior);
                // AI enrichment (best-effort; falls back to defaults).
                enhanceWithAI(target);
                targets.add(target);
                migrationCounter.increment();
            } catch (Exception e) {
                // Skip the bad record but keep processing the chunk.
                log.error("处理记录失败: {}", behavior.getId(), e);
            }
        }
        // Batch-write to MongoDB, then refresh the per-user cache.
        if (!targets.isEmpty()) {
            mongoTemplate.insertAll(targets);
            updateCache(targets);
        }
        return CompletableFuture.completedFuture(null);
    }

    /** Copies the source row field-by-field into a new target document. */
    private UserBehaviorTarget convertToTarget(UserBehavior source) {
        UserBehaviorTarget target = new UserBehaviorTarget();
        target.setUserId(source.getUserId());
        target.setProductId(source.getProductId());
        target.setBehaviorType(source.getBehaviorType().name());
        target.setCategoryId(source.getCategoryId());
        target.setSessionId(source.getSessionId());
        target.setTimestamp(source.getTimestamp());
        target.setDuration(source.getDuration());
        target.setDeviceType(source.getDeviceType());
        target.setIpAddress(source.getIpAddress());
        return target;
    }

    /**
     * Attaches AI insights to the target; on failure logs a warning and
     * applies a neutral default score instead of failing the record.
     */
    private void enhanceWithAI(UserBehaviorTarget target) {
        try {
            Map<String, Object> insights = aiAnalysisService
                    .analyzeUserBehavior(target);
            target.setBehaviorInsights(insights);
            target.setRelevanceScore((Double) insights.get("relevanceScore"));
            target.setUserProfile((String) insights.get("userProfile"));
        } catch (Exception e) {
            log.warn("AI分析失败,使用默认值: {}", e.getMessage());
            target.setRelevanceScore(0.5);
        }
    }

    /** Appends each record to the user's Redis list with a 24h TTL. */
    private void updateCache(List<UserBehaviorTarget> targets) {
        for (UserBehaviorTarget target : targets) {
            String key = "user_behavior:" + target.getUserId();
            redisTemplate.opsForList().rightPush(key, target);
            redisTemplate.expire(key, Duration.ofHours(24));
        }
    }
}
7. AI分析服务
java
/**
 * Enriches migrated behavior records via Spring AI: an embedding-derived
 * relevance score, an LLM-generated user profile, and product
 * recommendations built from the user's recent history in MongoDB.
 */
@Service
@Slf4j
public class AIAnalysisService {
    @Autowired
    private OpenAiChatClient chatClient;
    @Autowired
    private OpenAiEmbeddingClient embeddingClient;
    // NOTE(review): injected but not used in the visible code — confirm
    // whether the vector store was meant to index behavior vectors.
    @Autowired
    private VectorStore vectorStore;
    @Autowired
    private PromptTemplate recommendationPromptTemplate;
    @Autowired
    private MongoTemplate mongoTemplate;

    /**
     * Produces an insights map for one behavior record.
     *
     * <p>On success the map contains: relevanceScore (Double), userProfile
     * (String), recommendations (List&lt;String&gt;), behaviorVector
     * (List&lt;Double&gt;), analysisTimestamp (LocalDateTime). On failure it
     * contains only an "error" key — callers treat that as a soft failure.
     */
    public Map<String, Object> analyzeUserBehavior(UserBehaviorTarget behavior) {
        Map<String, Object> insights = new HashMap<>();
        try {
            // 1. Fetch the user's recent history (last 30 days, <=1000 docs).
            List<UserBehaviorTarget> userHistory = getUserBehaviorHistory(
                behavior.getUserId(), 30);
            // 2. Embed a textual summary of the current + historical behavior.
            List<Double> behaviorVector = generateBehaviorVector(behavior, userHistory);
            // 3. Score derived from the embedding (see calculateRelevanceScore).
            double relevanceScore = calculateRelevanceScore(behaviorVector);
            // 4. Short LLM-generated profile text.
            String userProfile = generateUserProfile(userHistory);
            // 5. LLM-generated product recommendations.
            List<String> recommendations = generateRecommendations(
                behavior.getUserId(), userHistory);
            insights.put("relevanceScore", relevanceScore);
            insights.put("userProfile", userProfile);
            insights.put("recommendations", recommendations);
            insights.put("behaviorVector", behaviorVector);
            insights.put("analysisTimestamp", LocalDateTime.now());
        } catch (Exception e) {
            log.error("AI分析失败", e);
            insights.put("error", e.getMessage());
        }
        return insights;
    }

    /** Queries MongoDB for the user's behavior in the last {@code days} days. */
    private List<UserBehaviorTarget> getUserBehaviorHistory(Long userId, int days) {
        Query query = new Query(Criteria.where("userId").is(userId)
            .and("timestamp").gte(LocalDateTime.now().minusDays(days)));
        // Cap the result to keep prompt/embedding sizes bounded.
        query.limit(1000);
        return mongoTemplate.find(query, UserBehaviorTarget.class);
    }

    /**
     * Builds a text description of the behavior and embeds it with the
     * OpenAI ada-002 model, returning the raw embedding vector.
     */
    private List<Double> generateBehaviorVector(UserBehaviorTarget behavior,
            List<UserBehaviorTarget> history) {
        // Build the behavior description text.
        String behaviorText = buildBehaviorText(behavior, history);
        // Generate the vector with the OpenAI embedding endpoint.
        EmbeddingRequest request = new EmbeddingRequest(
            List.of(behaviorText),
            EmbeddingOptions.builder()
                .withModel("text-embedding-ada-002")
                .build());
        EmbeddingResponse response = embeddingClient.embedForResponse(request);
        return response.getResults().get(0).getOutput();
    }

    /**
     * Concatenates the current behavior's fields with a count-per-type
     * summary of the history into one Chinese-language description string.
     */
    private String buildBehaviorText(UserBehaviorTarget behavior,
            List<UserBehaviorTarget> history) {
        StringBuilder text = new StringBuilder();
        text.append("用户行为: ").append(behavior.getBehaviorType())
            .append(", 商品类别: ").append(behavior.getCategoryId())
            .append(", 设备: ").append(behavior.getDeviceType())
            .append(", 时长: ").append(behavior.getDuration());
        // Append the historical behavior pattern (count per behavior type).
        Map<String, Long> behaviorCounts = history.stream()
            .collect(Collectors.groupingBy(
                UserBehaviorTarget::getBehaviorType,
                Collectors.counting()));
        text.append(", 历史行为模式: ").append(behaviorCounts);
        return text.toString();
    }

    /**
     * Simplified relevance score: the embedding's Euclidean norm divided by
     * its dimension. NOTE(review): this is not bounded to [0, 1] and is not a
     * similarity against anything — a placeholder heuristic.
     */
    private double calculateRelevanceScore(List<Double> behaviorVector) {
        double magnitude = behaviorVector.stream()
            .mapToDouble(d -> d * d)
            .sum();
        return Math.sqrt(magnitude) / behaviorVector.size();
    }

    /**
     * Summarizes behavior-type and category frequencies, then asks the chat
     * model for a <=50-character profile description.
     */
    private String generateUserProfile(List<UserBehaviorTarget> history) {
        // Frequency of each behavior type.
        Map<String, Long> behaviorPattern = history.stream()
            .collect(Collectors.groupingBy(
                UserBehaviorTarget::getBehaviorType,
                Collectors.counting()));
        // Frequency of each product category.
        Map<String, Long> categoryPreference = history.stream()
            .collect(Collectors.groupingBy(
                b -> String.valueOf(b.getCategoryId()),
                Collectors.counting()));
        String prompt = String.format(
            "基于用户行为模式 %s 和类别偏好 %s,生成简洁的用户画像描述(50字以内)",
            behaviorPattern, categoryPreference);
        ChatResponse response = chatClient.call(new Prompt(prompt));
        return response.getResult().getOutput().getContent();
    }

    /**
     * Fills the recommendation prompt template with the user's top categories
     * (max 5), purchased product ids (max 10) and behavior pattern, calls the
     * chat model, and parses the numbered list it returns.
     */
    public List<String> generateRecommendations(Long userId,
            List<UserBehaviorTarget> history) {
        // Extract distinct category preferences.
        List<String> categories = history.stream()
            .map(b -> String.valueOf(b.getCategoryId()))
            .distinct()
            .limit(5)
            .collect(Collectors.toList());
        // Distinct purchased product ids only.
        List<String> purchaseHistory = history.stream()
            .filter(b -> "PURCHASE".equals(b.getBehaviorType()))
            .map(b -> String.valueOf(b.getProductId()))
            .distinct()
            .limit(10)
            .collect(Collectors.toList());
        String behaviorPattern = analyzeBehaviorPattern(history);
        // Fill the prompt template and call the chat model.
        Map<String, Object> variables = Map.of(
            "userId", userId,
            "categories", String.join(", ", categories),
            "purchaseHistory", String.join(", ", purchaseHistory),
            "behaviorPattern", behaviorPattern
        );
        Prompt prompt = recommendationPromptTemplate.create(variables);
        ChatResponse response = chatClient.call(prompt);
        return parseRecommendations(response.getResult().getOutput().getContent());
    }

    /**
     * Describes when (hour-of-day histogram) and on which devices the user
     * is active, as a single formatted string.
     */
    private String analyzeBehaviorPattern(List<UserBehaviorTarget> history) {
        // Hour-of-day activity histogram.
        Map<Integer, Long> hourPattern = history.stream()
            .collect(Collectors.groupingBy(
                b -> b.getTimestamp().getHour(),
                Collectors.counting()));
        // Device preference histogram.
        Map<String, Long> devicePattern = history.stream()
            .collect(Collectors.groupingBy(
                UserBehaviorTarget::getDeviceType,
                Collectors.counting()));
        return String.format("活跃时段: %s, 设备偏好: %s",
            hourPattern, devicePattern);
    }

    /**
     * Parses lines of the form "1. ..." out of the model's reply, stripping
     * the leading number; lines that don't match are dropped.
     */
    private List<String> parseRecommendations(String response) {
        return Arrays.stream(response.split("\n"))
            .filter(line -> line.matches("\\d+\\..*"))
            .map(line -> line.replaceFirst("\\d+\\.", "").trim())
            .collect(Collectors.toList());
    }
}
8. Spring Batch配置
java
/**
 * Spring Batch wiring: a single job with one chunk-oriented step that reads
 * source rows with a JDBC cursor, enriches them via the processor, and
 * writes them to MongoDB/Redis.
 */
@Configuration
@EnableBatchProcessing
@Slf4j // Fix: the job listener below logs via `log`, which was never declared.
public class BatchConfig {

    @Autowired
    @Qualifier("sourceDataSource")
    private DataSource sourceDataSource;
    @Autowired
    private MigrationService migrationService;

    /** The migration job: one flow around the single chunk step. */
    @Bean
    public Job migrationJob(JobRepository jobRepository, Step migrationStep) {
        return new JobBuilder("migrationJob", jobRepository)
            .incrementer(new RunIdIncrementer())
            .flow(migrationStep)
            .end()
            .listener(new JobExecutionListener() {
                @Override
                public void beforeJob(JobExecution jobExecution) {
                    log.info("开始执行迁移任务: {}", jobExecution.getJobId());
                }
                @Override
                public void afterJob(JobExecution jobExecution) {
                    log.info("迁移任务完成: {}, 状态: {}",
                        jobExecution.getJobId(), jobExecution.getStatus());
                }
            })
            .build();
    }

    /**
     * Chunk step: commit every 10 000 items; tolerate up to 1000 skipped
     * records on any exception (matches the per-record best-effort design).
     * NOTE(review): with a synchronous AI call per item, 10k-item chunks are
     * very long transactions — consider a smaller chunk size in production.
     */
    @Bean
    public Step migrationStep(JobRepository jobRepository,
            PlatformTransactionManager transactionManager) {
        return new StepBuilder("migrationStep", jobRepository)
            .<UserBehavior, UserBehaviorTarget>chunk(10000, transactionManager)
            .reader(userBehaviorReader())
            .processor(userBehaviorProcessor())
            .writer(userBehaviorWriter())
            .faultTolerant()
            .skipLimit(1000)
            .skip(Exception.class)
            .build();
    }

    /**
     * Forward-only JDBC cursor over the whole source table, ordered by id.
     * NOTE(review): MySQL Connector/J buffers the full result set unless the
     * URL sets useCursorFetch=true (or fetch size Integer.MIN_VALUE) — verify
     * before running against a billion-row table.
     */
    @Bean
    @StepScope
    public JdbcCursorItemReader<UserBehavior> userBehaviorReader() {
        return new JdbcCursorItemReaderBuilder<UserBehavior>()
            .name("userBehaviorReader")
            .dataSource(sourceDataSource)
            .sql("SELECT * FROM user_behavior ORDER BY id")
            .rowMapper(new BeanPropertyRowMapper<>(UserBehavior.class))
            .fetchSize(10000)
            .build();
    }

    @Bean
    public UserBehaviorProcessor userBehaviorProcessor() {
        return new UserBehaviorProcessor();
    }

    @Bean
    public UserBehaviorWriter userBehaviorWriter() {
        return new UserBehaviorWriter();
    }
}
/**
 * Step processor: converts a source JPA entity into the MongoDB target
 * document and enriches it with AI-derived insights.
 */
@Component
public class UserBehaviorProcessor implements ItemProcessor<UserBehavior, UserBehaviorTarget> {

    @Autowired
    private AIAnalysisService aiAnalysisService;

    /**
     * @param item source row read by the cursor reader
     * @return enriched target document, never {@code null}
     */
    @Override
    public UserBehaviorTarget process(UserBehavior item) throws Exception {
        UserBehaviorTarget target = new UserBehaviorTarget();
        // Fix: explicit field mapping instead of BeanUtils.copyProperties,
        // which silently skips type-mismatched properties (id Long->String,
        // behaviorType enum->String) and hides mapping errors.
        target.setUserId(item.getUserId());
        target.setProductId(item.getProductId());
        target.setBehaviorType(item.getBehaviorType() != null
            ? item.getBehaviorType().name() : null);
        target.setCategoryId(item.getCategoryId());
        target.setSessionId(item.getSessionId());
        target.setTimestamp(item.getTimestamp());
        target.setDuration(item.getDuration());
        target.setDeviceType(item.getDeviceType());
        target.setIpAddress(item.getIpAddress());
        // AI enrichment; failures are reported inside the insights map
        // (an "error" key) rather than thrown.
        Map<String, Object> insights = aiAnalysisService.analyzeUserBehavior(target);
        target.setBehaviorInsights(insights);
        target.setRelevanceScore((Double) insights.get("relevanceScore"));
        target.setUserProfile((String) insights.get("userProfile"));
        return target;
    }
}
/**
 * Step writer: bulk-inserts the chunk into MongoDB, then appends each item
 * to the user's Redis list with a 24h TTL.
 */
@Component
public class UserBehaviorWriter implements ItemWriter<UserBehaviorTarget> {

    @Autowired
    private MongoTemplate mongoTemplate;
    @Autowired
    private RedisTemplate<String, Object> redisTemplate;

    /**
     * Fix: in Spring Batch 5 (Spring Boot 3.2) {@code ItemWriter.write}
     * receives a {@code Chunk}, not a {@code List} — the old signature does
     * not implement the interface and fails to compile.
     */
    @Override
    public void write(Chunk<? extends UserBehaviorTarget> chunk) throws Exception {
        List<? extends UserBehaviorTarget> items = chunk.getItems();
        // Batch insert into MongoDB.
        mongoTemplate.insertAll(items);
        // Refresh the per-user Redis cache.
        for (UserBehaviorTarget item : items) {
            String key = "user_behavior:" + item.getUserId();
            redisTemplate.opsForList().rightPush(key, item);
            redisTemplate.expire(key, Duration.ofHours(24));
        }
    }
}
9. 主启动类和控制器
java
/**
 * Application entry point. Scheduling is enabled for the periodic monitor
 * statistics and async execution for the chunk-processing service.
 */
@SpringBootApplication
@EnableScheduling
@EnableAsync
public class DataMigrationApplication {
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(DataMigrationApplication.class);
        application.run(args);
    }
}
/**
 * REST API: starts the migration job and serves per-user recommendations
 * computed from recent MongoDB history.
 */
@RestController
@RequestMapping("/api/migration")
@Slf4j
public class MigrationController {

    @Autowired
    private MigrationService migrationService;
    @Autowired
    private AIAnalysisService aiAnalysisService;
    // Fix: the recommendations endpoint below queries MongoDB, but the
    // original never declared/injected this field — it did not compile.
    @Autowired
    private MongoTemplate mongoTemplate;

    /** Kicks off the batch migration job; returns 500 with a message on failure. */
    @PostMapping("/start")
    public ResponseEntity<Map<String, Object>> startMigration() {
        try {
            migrationService.startMigration();
            Map<String, Object> response = new HashMap<>();
            response.put("status", "started");
            response.put("timestamp", LocalDateTime.now());
            response.put("message", "数据迁移任务已启动");
            return ResponseEntity.ok(response);
        } catch (Exception e) {
            log.error("启动迁移任务失败", e);
            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR)
                .body(Map.of("status", "error", "message", e.getMessage()));
        }
    }

    /**
     * Generates recommendations for a user from their 100 most recent
     * migrated behavior documents.
     */
    @GetMapping("/recommendations/{userId}")
    public ResponseEntity<List<String>> getRecommendations(@PathVariable Long userId) {
        try {
            // Fetch the user's recent behavior documents.
            Query query = new Query(Criteria.where("userId").is(userId));
            query.limit(100);
            List<UserBehaviorTarget> history = mongoTemplate.find(
                query, UserBehaviorTarget.class);
            List<String> recommendations = aiAnalysisService
                .generateRecommendations(userId, history);
            return ResponseEntity.ok(recommendations);
        } catch (Exception e) {
            log.error("生成推荐失败", e);
            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR)
                .body(List.of("推荐服务暂时不可用"));
        }
    }
}
10. 配置文件
yaml
# application.yml
spring:
  datasource:
    # Two Hikari pools bound in DataSourceConfig via @ConfigurationProperties;
    # "jdbc-url" is Hikari's own property name.
    source:
      jdbc-url: jdbc:mysql://source-db:3306/ecommerce?useSSL=false&serverTimezone=UTC
      username: source_user
      password: source_password
      driver-class-name: com.mysql.cj.jdbc.Driver
    target:
      jdbc-url: jdbc:mysql://target-db:3306/ecommerce_new?useSSL=false&serverTimezone=UTC
      username: target_user
      password: target_password
      driver-class-name: com.mysql.cj.jdbc.Driver
  data:
    mongodb:
      uri: mongodb://mongo-cluster:27017/user_behavior
    # Fix: Spring Boot 3.x reads Redis settings under spring.data.redis —
    # the Boot 2.x spring.redis.* location is no longer bound.
    # NOTE(review): the lettuce pool settings only apply with the Lettuce
    # client; DataSourceConfig builds a Jedis factory — align one or the other.
    redis:
      host: redis-cluster
      port: 6379
      timeout: 2000ms
      lettuce:
        pool:
          max-active: 100
          max-idle: 10
          min-idle: 5
  ai:
    openai:
      api-key: ${OPENAI_API_KEY}
      base-url: https://api.openai.com
      chat:
        options:
          model: gpt-3.5-turbo
          temperature: 0.7
      embedding:
        options:
          model: text-embedding-ada-002
  batch:
    job:
      enabled: false  # jobs are launched explicitly via the REST endpoint
    jdbc:
      initialize-schema: always
  task:
    execution:
      pool:
        core-size: 10
        max-size: 50
        queue-capacity: 100
logging:
  level:
    com.example: DEBUG
    org.springframework.batch: INFO
    org.springframework.ai: DEBUG
management:
  endpoints:
    web:
      exposure:
        include: health,info,metrics,prometheus
  endpoint:
    health:
      show-details: always
  # Fix: Boot 3.x key — management.metrics.export.prometheus.* is the
  # deprecated Boot 2.x form and is ignored.
  prometheus:
    metrics:
      export:
        enabled: true
11. 性能优化和监控
java
/**
 * Micrometer-based migration monitoring: success/error counters, a batch
 * processing timer, and a once-a-minute statistics log line.
 */
@Component
@Slf4j
public class MigrationMonitor {

    private final MeterRegistry meterRegistry;
    private final Counter successCounter;
    private final Counter errorCounter;
    private final Timer processingTimer;

    public MigrationMonitor(MeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
        this.successCounter = Counter.builder("migration.success")
            .description("Successful migration count")
            .register(meterRegistry);
        this.errorCounter = Counter.builder("migration.error")
            .description("Failed migration count")
            .register(meterRegistry);
        this.processingTimer = Timer.builder("migration.processing.time")
            .description("Processing time per batch")
            .register(meterRegistry);
    }

    /** Adds the batch's record count to the success counter. */
    @EventListener
    public void handleBatchSuccess(BatchSuccessEvent event) {
        successCounter.increment(event.getRecordCount());
        log.info("批次处理成功: {} 条记录", event.getRecordCount());
    }

    /** Counts one failed batch. */
    @EventListener
    public void handleBatchError(BatchErrorEvent event) {
        errorCounter.increment();
        log.error("批次处理失败: {}", event.getError().getMessage());
    }

    /** Logs aggregate migration statistics once per minute. */
    @Scheduled(fixedRate = 60000)
    public void logStatistics() {
        // Renamed: these are cumulative counts, not rates.
        double successCount = successCounter.count();
        double errorCount = errorCounter.count();
        double total = successCount + errorCount;
        if (total > 0) {
            // Fix: SLF4J placeholders are plain "{}" — "{:.2f}" is never
            // interpreted, so the rate is pre-formatted with String.format.
            log.info("迁移统计 - 成功: {}, 失败: {}, 成功率: {}%",
                (long) successCount, (long) errorCount,
                String.format("%.2f", (successCount / total) * 100));
        }
    }
}
这个完整的方案展示了:
- 分布式架构:使用多数据源、MongoDB、Redis等
- AI集成:结合Spring AI进行用户行为分析和推荐
- 批处理优化:使用Spring Batch处理大量数据
- 性能监控:集成Micrometer进行指标收集
- 错误处理:完善的异常处理与容错跳过机制(Spring Batch skip)