关于大事务的max_binlog_cache_size调大

2张表的表结构如下:
show create table dt_dim_goods_base_info\G;
*************************** 1. row ***************************
       Table: dt_dim_goods_base_info
Create Table: CREATE TABLE `dt_dim_goods_base_info` (
  `id` int(19) NOT NULL AUTO_INCREMENT COMMENT 'id',
  `rid` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'key  goods_id_dateid',
  `goods_id` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT '宝贝ID  该商城是唯一的',
  `goods_name` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '商品标题',
  `goods_ad` text COLLATE utf8_bin COMMENT '商品广告',
  `goods_show_price` decimal(38,4) DEFAULT NULL COMMENT '商品定价',
  `goods_sale_price` decimal(38,4) DEFAULT NULL COMMENT '商品真实销售价格',
  `goods_shop_id` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '店铺id',
  `goods_shop_name` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '店铺名称',
  `brand_id` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '品牌id',
  `brand_name` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '品牌name',
  `goods_main_pic` varchar(2000) COLLATE utf8_bin DEFAULT NULL COMMENT '主图片地址',
  `goods_url` varchar(2000) COLLATE utf8_bin DEFAULT NULL COMMENT '商品的url地址',
  `dateid` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT '时间',
  `industry_category_id` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '行业分类ID',
  `root_category_id` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '顶层行业分类ID',
  `data_souce_type` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT 'tmall',
  PRIMARY KEY (`id`),
  KEY `goods_id` (`goods_id`)
) ENGINE=InnoDB AUTO_INCREMENT=3473356 DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='商品基本信息'
1 row in set (0.00 sec)
show create table dt_dim_goods_base_info_etl\G;
*************************** 1. row ***************************
       Table: dt_dim_goods_base_info_etl
Create Table: CREATE TABLE `dt_dim_goods_base_info_etl` (
  `id` int(19) NOT NULL AUTO_INCREMENT COMMENT 'id',
  `rid` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'key  goods_id_dateid',
  `goods_id` varchar(100) COLLATE utf8_bin NOT NULL COMMENT '宝贝ID  该商城是唯一的',
  `goods_name` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '商品标题',
  `goods_ad` text COLLATE utf8_bin COMMENT '商品广告',
  `goods_show_price` decimal(38,4) DEFAULT NULL COMMENT '商品定价',
  `goods_sale_price` decimal(38,4) DEFAULT NULL COMMENT '商品真实销售价格',
  `goods_shop_id` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '店铺id',
  `goods_shop_name` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '店铺名称',
  `brand_id` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '品牌id',
  `brand_name` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '品牌name',
  `goods_main_pic` varchar(2000) COLLATE utf8_bin DEFAULT NULL COMMENT '主图片地址',
  `goods_url` varchar(2000) COLLATE utf8_bin DEFAULT NULL COMMENT '商品的url地址',
  `dateid` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT '时间',
  `industry_category_id` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '行业分类ID',
  `root_category_id` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '顶层行业分类ID',
  `data_souce_type` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT 'tmall',
  PRIMARY KEY (`id`),
  UNIQUE KEY `goods_id` (`goods_id`)
) ENGINE=InnoDB AUTO_INCREMENT=13837589 DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='商品基本信息表B'
2张表的数据 一个在320万 一个在350万
调用存储过程如下:
DELIMITER //
CREATE DEFINER=`root`@`%` PROCEDURE `sp_dim_goods_base_info`()
    SQL SECURITY INVOKER
BEGIN
        
        call sp_ddd_dim_goods_base_info();
        
        UPDATE dt_dim_goods_base_info_etl a INNER JOIN dt_dim_goods_base_info b on a.goods_id=b.goods_id
        SET
            a.rid=b.rid ,
            a.goods_name=b.goods_name ,
            a.goods_ad=b.goods_ad ,
            a.goods_show_price=b.goods_show_price,
            a.goods_sale_price=b.goods_sale_price ,
            a.goods_shop_id=b.goods_shop_id ,
            a.goods_shop_name=b.goods_shop_name ,
            a.brand_id=b.brand_id ,
            a.brand_name=b.brand_name ,
            a.goods_main_pic=b.goods_main_pic ,
            a.goods_url=b.goods_url ,
            a.dateid=b.dateid ,
            a.industry_category_id=b.industry_category_id,
            a.root_category_id=b.root_category_id,
            a.data_souce_type=b.data_souce_type
        where a.goods_id=b.goods_id;
        insert into dt_dim_goods_base_info_etl(
            rid,
            goods_id,
            goods_name,
            goods_ad,
            goods_show_price,
            goods_sale_price,
            goods_shop_id,
            goods_shop_name,
            brand_id,
            brand_name,
            goods_main_pic,
            goods_url,
            dateid,
            industry_category_id,
            root_category_id,
            data_souce_type
        )
        SELECT
            b.rid,
            b.goods_id,
            b.goods_name,
            b.goods_ad,
            b.goods_show_price,
            b.goods_sale_price,
            b.goods_shop_id,
            b.goods_shop_name,
            b.brand_id,
            b.brand_name,
            b.goods_main_pic,
            b.goods_url,
            b.dateid,
            b.industry_category_id,
            b.root_category_id,
            b.data_souce_type
        FROM
            dt_dim_goods_base_info b LEFT  JOIN dt_dim_goods_base_info_etl a on a.goods_id=b.goods_id
        where a.goods_id is null;
        end//
DELIMITER ;
星期五开发说这个存储过程执行报错,提示max_binlog_cache_size过小,当时这个值默认为2G。后来不停放大这个值到9G这个存储过程才得以执行。看到这2个表均有text列,我初步的想法是把这个text列和2个varchar(2000)的列拆分出去。第2个想法是把这个insert+update操作换成replace into或者insert  into ...on duplicate key update...现在疑问有3个 放大max_binlog_cache_size到这么大可不可行。。因为看官档上说 The maximum recommended value is 4GB; this is due to the fact that MySQL currently cannot work with binary log positions greater than 4GB.  会不会有什么不好的影响?第2 replace inot和 insert into ..on duplicate key update 哪个效率会更好一点?第3  因为这是个爬虫大数据项目,数据量增加很快,对这个更新+插入还有没有什么更好的方法?
已邀请:

oracletomysql - Oracle to MySQL

赞同来自:

1.将待update/insert的结果集,分别放到2个中间表;2.根据中间表,小批量update/insert并提交(建议2000条提交一次);意思是将大事务,拆分成多个小事务,并及时提交;(通过游标处理也可以,但效率不如上面的方法)

要回复问题请先登录注册