流程:
1. 使用 DrissionPage 采集知音漫客约 2 万条国产动漫数据,存入 MySQL 数据库;
2. 使用 MapReduce 对采集到的动漫数据进行数据清洗、数据项拆分等处理,转换为 .csv 文件后上传至 Hadoop 的 HDFS 集群;
3. 在 Hive 中建库、建表,并导入 .csv 动漫数据;
4. 一半指标使用 Hive SQL 分析得出,另一半指标使用 Spark(Scala)完成;
5. 使用 Sqoop 将分析结果导入 MySQL 数据库;
6. 使用 Flask + ECharts 搭建可视化大屏。
创新点:采用 Python 全新爬虫库 DrissionPage、海量数据、可视化大屏、离线 Hive + 实时 Spark 双实现。
可选装:推荐系统、预测系统、知识图谱、后台管理等。
核心算法代码分享如下:
sql
/*
Navicat MySQL Data Transfer
Source Server : Win7本地测试_localhost_3306_123456_版本5.7
Source Server Version : 50714
Source Host : localhost:3306
Source Database : hive_zymk
Target Server Type : MYSQL
Target Server Version : 50714
File Encoding : 65001
Date: 2023-08-30 11:43:34
*/
-- Turn off foreign-key enforcement for this session so the table below can
-- be dropped and recreated without constraint errors.
-- NOTE(review): the setting is not restored in the visible part of this
-- dump -- confirm that it is re-enabled (or the session ends) afterwards.
SET FOREIGN_KEY_CHECKS = 0;

-- ---------------------------------------------------------------------
-- tb_zymk
-- Raw comic records, one surrogate-keyed row each. Every scraped field,
-- including the counters and timestamps, is kept as binary-collated
-- utf8mb4 TEXT; the English glosses below translate the column COMMENTs.
-- ---------------------------------------------------------------------
DROP TABLE IF EXISTS `tb_zymk`;

CREATE TABLE `tb_zymk` (
    `id`            INT(11) NOT NULL AUTO_INCREMENT,              -- surrogate primary key
    `title`         TEXT COLLATE utf8mb4_bin COMMENT '漫画名称',        -- comic title
    `update_times`  TEXT COLLATE utf8mb4_bin COMMENT '每周更新时间',    -- weekly update schedule
    `tags`          TEXT COLLATE utf8mb4_bin COMMENT '标签',            -- tags
    `content`       TEXT COLLATE utf8mb4_bin COMMENT '内容',            -- content / description
    `readings`      TEXT COLLATE utf8mb4_bin COMMENT '阅读量',          -- read count (stored as text)
    `subscribes`    TEXT COLLATE utf8mb4_bin COMMENT '订阅量',          -- subscription count (stored as text)
    `rewards`       TEXT COLLATE utf8mb4_bin COMMENT '打赏',            -- rewards / tips
    `monthtickets`  TEXT COLLATE utf8mb4_bin COMMENT '月票',            -- monthly tickets
    `recommends`    TEXT COLLATE utf8mb4_bin COMMENT '推荐次数',        -- recommendation count
    `comments`      TEXT COLLATE utf8mb4_bin COMMENT '评论量',          -- comment count
    `scores`        TEXT COLLATE utf8mb4_bin COMMENT '评分',            -- rating score
    `author`        TEXT COLLATE utf8mb4_bin COMMENT '作者',            -- author
    `zps`           TEXT COLLATE utf8mb4_bin COMMENT '代表作',          -- representative works
    `ctime`         TEXT COLLATE utf8mb4_bin COMMENT '章节最后更新时间',  -- time of the latest chapter update
    `img`           TEXT COLLATE utf8mb4_bin,                     -- presumably the cover image URL -- TODO confirm
    `url`           TEXT COLLATE utf8mb4_bin,                     -- presumably the comic page URL -- TODO confirm
    PRIMARY KEY (`id`)
)
    ENGINE = InnoDB
    AUTO_INCREMENT = 1437        -- next id value carried over from the exported table
    DEFAULT CHARSET = utf8mb4
    COLLATE = utf8mb4_bin;