admin管理员组文章数量:1568354
使用mysql event解决千万级别数据多表JOIN统计的需求
背景
公司某条业务线存在大量的统计汇总需求,统计范围涉及到十几张表,表数据从200万到1亿不等。里面每个字段都涉及到各种条件查询,排序和分页。由于是接手其他人的代码,所以最初思路还是从MySQL逻辑和索引结构上去调整。
原始SQL示例
-- "Before" query, kept verbatim as the article's bad example.
-- Per-device totals assembled from TEN derived tables, each of which
-- rescans one of the same large base tables over the same time window:
--   start_time in (yesterday 00:00, now - 24h)  -- i.e. "yesterday up to this time"
-- NOTE(review): wrapping the column in UNIX_TIMESTAMP( start_time ) makes
-- every range predicate non-SARGable, so no index on start_time can be used;
-- each derived table is a full scan.
SELECT
smallCar.device_id,
smallCar.device_name,
bigCar.bigCarTotal,
smallCar.smallCarTotal,
hcCar.hcCarTotal,
allCarsTotal.allCarTotal,
nonCarList.nonCarFlowTotal,
eventAllList.nonEventAllTotal,
FFCREvent.nonFFCRAllTotal,
WDTKEvent.nonWETKAllTotal,
NXSLEvent.nonNXSLlTotal,
QCDREvent.nonQCDRTotal
FROM
-- Driving set: small passenger cars ('小客车') per device. Every other
-- derived table is LEFT JOINed onto this one, so devices with no small-car
-- traffic in the window silently drop out of the report entirely.
(
SELECT
COUNT(*) AS smallCarTotal,
device_id,
device_name
FROM
vehicle_flow_record_partition
WHERE
vehicle_type = '小客车'
AND UNIX_TIMESTAMP( start_time ) > (
UNIX_TIMESTAMP(
DATE_ADD( CURDATE(), INTERVAL 0 HOUR )) - 86400
)
AND UNIX_TIMESTAMP( start_time ) < ( UNIX_TIMESTAMP( NOW()) - 86400 )
GROUP BY
device_id,
device_name
) AS smallCar
-- Medium/large passenger cars ('中大客车') per device — second full scan
-- of the same table, differing only in the vehicle_type filter.
LEFT JOIN (
SELECT
COUNT(*) AS bigCarTotal,
device_id,
device_name
FROM
vehicle_flow_record_partition AS bigCar
WHERE
vehicle_type = '中大客车'
AND UNIX_TIMESTAMP( start_time ) > (
UNIX_TIMESTAMP(
DATE_ADD( CURDATE(), INTERVAL 0 HOUR )) - 86400
)
AND UNIX_TIMESTAMP( start_time ) < ( UNIX_TIMESTAMP( NOW()) - 86400 )
GROUP BY
device_id,
device_name
) AS bigCar ON smallCar.device_id = bigCar.device_id
-- Trucks ('货车') per device — third scan of the same table.
LEFT JOIN (
SELECT
COUNT(*) AS hcCarTotal,
device_id
FROM
vehicle_flow_record_partition
WHERE
vehicle_type = '货车'
AND UNIX_TIMESTAMP( start_time ) > (
UNIX_TIMESTAMP(
DATE_ADD( CURDATE(), INTERVAL 0 HOUR )) - 86400
)
AND UNIX_TIMESTAMP( start_time ) < ( UNIX_TIMESTAMP( NOW()) - 86400 )
GROUP BY
device_id
) AS hcCar ON hcCar.device_id = smallCar.device_id
-- All motor vehicles per device (no vehicle_type filter) — fourth scan;
-- this total could have been derived as the sum of the per-type counts.
LEFT JOIN (
SELECT
COUNT(*) AS allCarTotal,
device_id,
device_name
FROM
vehicle_flow_record_partition
WHERE
UNIX_TIMESTAMP( start_time ) > (
UNIX_TIMESTAMP(
DATE_ADD( CURDATE(), INTERVAL 0 HOUR )) - 86400
)
AND UNIX_TIMESTAMP( start_time ) < ( UNIX_TIMESTAMP( NOW()) - 86400 )
GROUP BY
device_id,
device_name
) AS allCarsTotal ON allCarsTotal.device_id = smallCar.device_id
-- Non-motor-vehicle flow per device, from a second large table.
LEFT JOIN (
SELECT
COUNT(*) AS nonCarFlowTotal,
device_id,
device_name
FROM
non_motor_vehicle_flow_record_partition
WHERE
UNIX_TIMESTAMP( start_time ) > (
UNIX_TIMESTAMP(
DATE_ADD( CURDATE(), INTERVAL 0 HOUR )) - 86400
)
AND UNIX_TIMESTAMP( start_time ) < ( UNIX_TIMESTAMP( NOW()) - 86400 )
GROUP BY
device_id,
device_name
) AS nonCarList ON nonCarList.device_id = allCarsTotal.device_id
-- All events per device. Here start_time is compared directly against a
-- UNIX_TIMESTAMP() result, so event_list stores it as a numeric epoch,
-- unlike the DATETIME column of the flow tables.
LEFT JOIN (
SELECT
COUNT(*) AS nonEventAllTotal,
device_id,
device_name
FROM
event_list
WHERE
start_time > (
UNIX_TIMESTAMP(
DATE_ADD( CURDATE(), INTERVAL 0 HOUR )) - 86400
)
AND start_time < ( UNIX_TIMESTAMP( NOW()) - 86400 )
GROUP BY
device_id,
device_name
) AS eventAllList ON eventAllList.device_id = allCarsTotal.device_id
-- Events of type 1 — one extra full scan of event_list per event type
-- below; a single scan with conditional aggregation would replace all four.
-- (FFCR/WDTK/NXSL/QCDR look like pinyin abbreviations of the event names —
-- TODO confirm with the business side.)
LEFT JOIN (
SELECT
COUNT(*) AS nonFFCRAllTotal,
device_id,
device_name
FROM
event_list
WHERE
event_type = 1
AND start_time > (
UNIX_TIMESTAMP(
DATE_ADD( CURDATE(), INTERVAL 0 HOUR )) - 86400
)
AND start_time < ( UNIX_TIMESTAMP( NOW()) - 86400 )
GROUP BY
device_id,
device_name
) AS FFCREvent ON FFCREvent.device_id = allCarsTotal.device_id
-- Events of type 2.
LEFT JOIN (
SELECT
COUNT(*) AS nonWETKAllTotal,
device_id,
device_name
FROM
event_list
WHERE
event_type = 2
AND start_time > (
UNIX_TIMESTAMP(
DATE_ADD( CURDATE(), INTERVAL 0 HOUR )) - 86400
)
AND start_time < ( UNIX_TIMESTAMP( NOW()) - 86400 )
GROUP BY
device_id,
device_name
) AS WDTKEvent ON WDTKEvent.device_id = FFCREvent.device_id
-- Events of type 3.
LEFT JOIN (
SELECT
COUNT(*) AS nonNXSLlTotal,
device_id,
device_name
FROM
event_list
WHERE
event_type = 3
AND start_time > (
UNIX_TIMESTAMP(
DATE_ADD( CURDATE(), INTERVAL 0 HOUR )) - 86400
)
AND start_time < ( UNIX_TIMESTAMP( NOW()) - 86400 )
GROUP BY
device_id,
device_name
) AS NXSLEvent ON NXSLEvent.device_id = FFCREvent.device_id
-- Events of type 4. Note the join keys chain through earlier derived
-- tables (NXSLEvent -> FFCREvent -> allCarsTotal -> smallCar), so a miss
-- anywhere upstream NULLs out everything downstream.
LEFT JOIN (
SELECT
COUNT(*) AS nonQCDRTotal,
device_id,
device_name
FROM
event_list
WHERE
event_type = 4
AND start_time > (
UNIX_TIMESTAMP(
DATE_ADD( CURDATE(), INTERVAL 0 HOUR )) - 86400
)
AND start_time < ( UNIX_TIMESTAMP( NOW()) - 86400 )
GROUP BY
device_id,
device_name
) AS QCDREvent ON QCDREvent.device_id = NXSLEvent.device_id
原始SQL大概在190行左右,存在大量的join和where,看到这个SQL我心里就凉了。在多张表上亿的数据量下,基本上不可能靠索引进行优化。但是我还是尝试了这种方式,最终优化的效果也不理想,所以直接废弃这部分逻辑直接重写。
重构过程
分析业务
由于我是被从其他项目组临时借调过来帮忙,所以并不懂这方面的业务逻辑,用了接近一天的时间了解了一下所有涉及到统计的需求,包括以下几个方面(涉及到公司核心业务,这部分不具体描述,只说一个思路)
- 这部分数据要描述的业务场景是什么?
- 客户对数据的实时变化感知是否明显?
- 是否要求数据实时流处理?
- 客户现场的硬件配置和部署方案
- 现有的项目结构***(这是我一个原则,尽量不去破坏原有的规则,在原有的规则上层进行拓展优化或重新开启新规则)***
- 交付时间点
确定重构方案
基于产品和研发反馈的信息
- 确认客户要求可以定时查看数据的日月同比环比和统计数据
- 有数据敏感度但对实时性要求不高
- 允许小范围的丢失数据
- 对异常数据要求必须过滤(比如车牌号为****这种)
基于上面的思考,首先想到是否可以将数据先计算好,丢到缓存中,定时更新缓存,用户查询数据时,直接从缓存中查询。但是存在缓存中会有一个核心问题。
- 增加了系统的复杂性
- 破坏了原有的技术架构
无论是增加系统复杂性还是破坏原有技术架构,都会造成一堆新坑出来,填坑的成本一般来说是远远大于开发成本的。所以我又重新去调研。客户对这部分数据的需求是什么。最多需要检索多少天,会不会有查询原始数据的需求。最终确认的结果是
统计数据最多两年
不会查原始数据(也就意味着原始数据在汇总统计后就可以丢弃)
再次确认后,我在想用MySQL是否可以解决这样的业务场景。
于是去看了官方文档。发现了一个好东西叫MySQL Event。感兴趣的自行去看官方文档。
重构后的方案
具体的思路是使用Join统计数据,由于业务需求,分别统计每分钟和每天的数据,用两张表分别保存。
- 考虑到数据一致性和性能的问题,每天的数据不再单独从原始表中统计,分钟表从原始业务表计算,天数表从分钟表计算。
- 使用游标解决数据增量问题
- 解决数据延迟上传问题
- 解决原始表数据量导致磁盘爆满问题,定时删除原始表数据(好的做法是备份原始表数据存到单独的数据盘(分区表),防止客户某天想到要看原始数据)
USE jwgateway;

-- Per-minute aggregate: one row per (minute bucket, device), upserted via
-- REPLACE on the unique key below.
-- Fix vs. original: update_time and bizdate are now NOT NULL. bizdate is
-- part of the unique key, and MySQL unique indexes allow an unlimited
-- number of NULL entries, so a nullable bizdate would let duplicate rows
-- slip past the REPLACE-based upsert.
CREATE TABLE IF NOT EXISTS mv_road_traffic_analysis_minu
(
    update_time          datetime     not null default '1970-01-01 00:00:00', -- when this row was last (re)computed
    device_id            varchar(32)  not null default '',
    device_name          varchar(256) not null default '',
    bizdate              datetime     not null default '1970-01-01 00:00:00', -- minute bucket, seconds truncated to :00
    vehicle_cnt          bigint       not null default 0, -- all motor vehicles
    car_count            bigint       not null default 0, -- vehicle_type = '小客车'
    bus_count            bigint       not null default 0, -- vehicle_type = '中大客车'
    van_count            bigint       not null default 0, -- vehicle_type = '货车'
    unvehicle_cnt        bigint       not null default 0, -- non-motor vehicles
    event_cnt            bigint       not null default 0, -- all non-motor-vehicle events
    into_drivingarea_cnt bigint       not null default 0,
    without_helmet_cnt   bigint       not null default 0,
    nmve_converse_cnt    bigint       not null default 0,
    carry_people_cnt     bigint       not null default 0,
    UNIQUE KEY uk_time_device (bizdate, device_id),
    KEY idx_device (device_id, bizdate)
) ENGINE = INNODB
  DEFAULT CHARSET = utf8mb4;

-- Per-day rollup table: identical structure and keys (LIKE copies indexes),
-- populated from the minute table rather than from the raw detail tables.
CREATE TABLE IF NOT EXISTS mv_road_traffic_analysis_date LIKE mv_road_traffic_analysis_minu;

-- Idempotent redeploy: drop old routines and the scheduler event first.
DROP PROCEDURE IF EXISTS road_traffic_analysis;
DROP PROCEDURE IF EXISTS clean_expire_traffic;
DROP EVENT IF EXISTS event_road_traffic_analysis;
DELIMITER //
-- Incrementally aggregates the raw flow/event tables into the per-minute
-- table mv_road_traffic_analysis_minu, then rolls the minute table up into
-- the per-day table mv_road_traffic_analysis_date. Intended to be invoked
-- every minute by the scheduler event.
CREATE PROCEDURE road_traffic_analysis()
BEGIN
-- Incremental watermark: only source rows at/after this instant are
-- (re)aggregated on this run.
DECLARE minute_table_offset DATETIME DEFAULT '1970-01-01 00:00:00';
-- Replay the hour before the newest aggregated minute to absorb
-- late-arriving data.
-- Fix vs. original: COALESCE keeps the epoch default on the very first run.
-- MAX() over an empty table is NULL, and a bare SELECT ... INTO would
-- overwrite the DECLARE default with NULL, making every
-- "start_time >= NULL" predicate false — nothing would ever be aggregated.
SELECT COALESCE(DATE_SUB(MAX(bizdate), INTERVAL 1 HOUR), '1970-01-01 00:00:00')
INTO minute_table_offset
FROM mv_road_traffic_analysis_minu;
-- REPLACE (not INSERT): minutes inside the one-hour replay window are
-- overwritten via the (bizdate, device_id) unique key. The explicit column
-- list protects against future column-order changes.
REPLACE INTO mv_road_traffic_analysis_minu
    (update_time, device_id, device_name, bizdate,
     vehicle_cnt, car_count, bus_count, van_count,
     unvehicle_cnt, event_cnt, into_drivingarea_cnt,
     without_helmet_cnt, nmve_converse_cnt, carry_people_cnt)
SELECT NOW()
     , s.device_id
     , IFNULL(LEFT(s.device_name, 256), '') AS device_name
     , s.bizdate
     , IFNULL(a.vehicle_cnt, 0) AS vehicle_cnt
     , IFNULL(a.car_count, 0) AS car_count
     , IFNULL(a.bus_count, 0) AS bus_count
     , IFNULL(a.van_count, 0) AS van_count
     , IFNULL(b.unvehicle_cnt, 0) AS unvehicle_cnt
     , IFNULL(c.event_cnt, 0) AS event_cnt
     , IFNULL(c.into_drivingarea_cnt, 0) AS into_drivingarea_cnt
     , IFNULL(c.without_helmet_cnt, 0) AS without_helmet_cnt
     , IFNULL(c.nmve_converse_cnt, 0) AS nmve_converse_cnt
     , IFNULL(c.carry_people_cnt, 0) AS carry_people_cnt
FROM (
    -- Driver set: every (minute, device) seen in ANY source table, plus one
    -- representative non-empty device_name per group (empty names are mapped
    -- to NULL so GROUP_CONCAT skips them).
    -- Fix vs. original: UNION ALL — the GROUP BY below already deduplicates,
    -- so plain UNION only added a useless sort/dedup pass.
    SELECT bizdate
         , device_id
         , GROUP_CONCAT(DISTINCT device_name) AS device_name
    FROM (
        SELECT DATE_FORMAT(start_time, '%Y-%m-%d %H:%i:00') AS bizdate
             , device_id
             , IF(LENGTH(device_name) > 0, device_name, NULL) AS device_name
        FROM vehicle_flow_record
        WHERE start_time >= minute_table_offset
        UNION ALL
        SELECT DATE_FORMAT(start_time, '%Y-%m-%d %H:%i:00') AS bizdate
             , device_id
             , IF(LENGTH(device_name) > 0, device_name, NULL) AS device_name
        FROM non_motor_vehicle_flow_record
        WHERE start_time >= minute_table_offset
        UNION ALL
        -- event_list stores start_time as a unix epoch, hence the
        -- FROM_UNIXTIME here and UNIX_TIMESTAMP on the watermark below.
        SELECT FROM_UNIXTIME(start_time, '%Y-%m-%d %H:%i:00') AS bizdate
             , device_id
             , IF(LENGTH(device_name) > 0, device_name, NULL) AS device_name
        FROM event_list
        WHERE start_time >= UNIX_TIMESTAMP(minute_table_offset)
    ) p
    GROUP BY bizdate, device_id
) s
-- Motor-vehicle counts per minute, with per-type conditional aggregation
-- (one scan instead of one scan per vehicle type).
LEFT JOIN (
    SELECT DATE_FORMAT(start_time, '%Y-%m-%d %H:%i:00') AS bizdate
         , device_id
         , COUNT(*) AS vehicle_cnt
         , SUM(IF(vehicle_type = '小客车', 1, 0)) AS car_count
         , SUM(IF(vehicle_type = '中大客车', 1, 0)) AS bus_count
         , SUM(IF(vehicle_type = '货车', 1, 0)) AS van_count
    FROM vehicle_flow_record
    WHERE start_time >= minute_table_offset
    GROUP BY device_id, bizdate
) a ON s.device_id = a.device_id AND s.bizdate = a.bizdate
-- Non-motor-vehicle flow counts per minute.
LEFT JOIN (
    SELECT DATE_FORMAT(start_time, '%Y-%m-%d %H:%i:00') AS bizdate
         , device_id
         , COUNT(*) AS unvehicle_cnt
    FROM non_motor_vehicle_flow_record
    WHERE start_time >= minute_table_offset
    GROUP BY device_id, bizdate
) b ON s.device_id = b.device_id AND s.bizdate = b.bizdate
-- Non-motor-vehicle event counts per minute, per alarm type.
LEFT JOIN (
    SELECT FROM_UNIXTIME(start_time, '%Y-%m-%d %H:%i:00') AS bizdate
         , device_id
         , COUNT(*) AS event_cnt
         , SUM(IF(alarm_type = 'into_drivingarea', 1, 0)) AS into_drivingarea_cnt
         , SUM(IF(alarm_type = 'without_helmet', 1, 0)) AS without_helmet_cnt
         , SUM(IF(alarm_type = 'nmve_converse', 1, 0)) AS nmve_converse_cnt
         , SUM(IF(alarm_type = 'carry_people', 1, 0)) AS carry_people_cnt
    FROM event_list
    WHERE start_time >= UNIX_TIMESTAMP(minute_table_offset)
      -- NOTE(review): 'obejct_class' looks like a typo for 'object_class',
      -- but it is kept as-is in case the live schema really spells it this
      -- way — verify against the event_list DDL before renaming.
      AND obejct_class = '非机动车'
    GROUP BY device_id, bizdate
) c ON s.device_id = c.device_id AND s.bizdate = c.bizdate;
-- Day-level rollup, recomputed from the WHOLE minute table on every run
-- (the minute table is small — bounded by devices * minutes — so a full
-- recompute keeps the logic simple and self-healing after replays).
REPLACE INTO mv_road_traffic_analysis_date
    (update_time, device_id, device_name, bizdate,
     vehicle_cnt, car_count, bus_count, van_count,
     unvehicle_cnt, event_cnt, into_drivingarea_cnt,
     without_helmet_cnt, nmve_converse_cnt, carry_people_cnt)
SELECT MAX(update_time) AS update_time
     , device_id
     , MAX(device_name) AS device_name
     , DATE_FORMAT(bizdate, '%Y-%m-%d') AS stdate
     , SUM(vehicle_cnt) AS vehicle_cnt
     , SUM(car_count) AS car_count
     , SUM(bus_count) AS bus_count
     , SUM(van_count) AS van_count
     , SUM(unvehicle_cnt) AS unvehicle_cnt
     , SUM(event_cnt) AS event_cnt
     , SUM(into_drivingarea_cnt) AS into_drivingarea_cnt
     , SUM(without_helmet_cnt) AS without_helmet_cnt
     , SUM(nmve_converse_cnt) AS nmve_converse_cnt
     , SUM(carry_people_cnt) AS carry_people_cnt
FROM mv_road_traffic_analysis_minu
GROUP BY device_id, stdate;
END //
-- Retention job: purges raw detail rows older than the retention window.
-- Aggregated history lives on in the minute/day tables, so the detail
-- tables only need to cover the late-data replay horizon.
CREATE PROCEDURE clean_expire_traffic()
BEGIN
    -- Raw detail is kept for 11 days.
    DECLARE retention_days INT DEFAULT 11;
    -- event_list stores start_time as a unix epoch (seconds).
    DELETE FROM event_list
    WHERE start_time < UNIX_TIMESTAMP() - retention_days * 86400;
    -- The two flow tables store start_time as DATETIME.
    DELETE FROM vehicle_flow_record
    WHERE start_time < DATE_SUB(NOW(), INTERVAL retention_days DAY);
    DELETE FROM non_motor_vehicle_flow_record
    WHERE start_time < DATE_SUB(NOW(), INTERVAL retention_days DAY);
END //
-- Scheduler entry point: run aggregation + cleanup once a minute.
-- ON COMPLETION PRESERVE keeps the event defined (and re-runnable) forever.
CREATE EVENT event_road_traffic_analysis
ON SCHEDULE EVERY 1 MINUTE
ON COMPLETION PRESERVE
DO BEGIN
-- Overlap guard: if the previous invocation is still running (e.g. a large
-- backlog after downtime), skip this tick instead of stacking concurrent
-- runs. The lock is released when this event's session ends, so an aborted
-- run cannot wedge the schedule.
IF GET_LOCK('event_road_traffic_analysis', 0) = 1 THEN
CALL road_traffic_analysis();
CALL clean_expire_traffic();
DO RELEASE_LOCK('event_road_traffic_analysis');
END IF;
END //
DELIMITER ;
测试
具体测试不做了。天数表最多 2356 条数据,分钟表约 2356*1440 条数据(每天 1440 分钟),加了索引不会再存在性能问题。
版权声明:本文标题:使用mysql event解决亿级别数据多表JOIN统计的需求 内容由热心网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:https://m.elefans.com/dongtai/1727613759a1123021.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论