赞
踩
HBase 海量存储案例 预分区表
如果按默认方式创建表,HBase的顺序写入可能会受到RegionServer热点的影响。对行键进行预分区或加盐可以缓解热点问题。在Phoenix中,可以使用以下两种方式:
1.ROWKEY预分区
2.加盐指定数量分区
1.1.1 ROWKEY预分区
按照ROWKEY(订单ID)来分区,切分点为'3'、'5'、'7',一共4个分区,并指定数据的压缩格式为GZ。
-- Recreate ORDER_DTL with GZ compression, pre-split on the row key ("id").
-- The three split points '3', '5', '7' produce four regions.
drop table if exists ORDER_DTL;
create table if not exists ORDER_DTL(
"id" varchar primary key,
C1."status" varchar,
C1."money" float,
C1."pay_way" integer,
C1."user_id" varchar,
C1."operation_time" varchar,
C1."category" varchar
)
COMPRESSION='GZ'
SPLIT ON ('3','5','7');
hbase shell 样例
# Creates 'mytable' with one column family 'cf' (1 version, row-level bloom filter,
# SNAPPY compression, FAST_DIFF block encoding, TTL of 604800 seconds = 7 days),
# pre-split at row keys '10', '20' and '30', i.e. four regions.
create 'mytable', {NAME=>'cf', VERSIONS=>1, BLOOMFILTER=>'ROW', COMPRESSION=>'SNAPPY', DATA_BLOCK_ENCODING=>'FAST_DIFF', TTL => '604800'}, SPLITS => ['10', '20', '30']
我们尝试往表中插入一些数据,然后去HBase中查看数据的分布情况。
-- Sample order rows. Values are positional: id, status, money, pay_way, user_id,
-- operation_time, category. "id" is the primary key, so UPSERTs that reuse an id
-- update the existing row instead of adding a new one.
UPSERT INTO "ORDER_DTL" VALUES('02602f66-adc7-40d4-8485-76b5632b5b53','已提交',4070,1,'4944191','2020-04-25 12:09:16','手机;');
UPSERT INTO "ORDER_DTL" VALUES('0968a418-f2bc-49b4-b9a9-2157cf214cfd','已完成',4350,1,'1625615','2020-04-25 12:09:37','家用电器;;电脑;');
UPSERT INTO "ORDER_DTL" VALUES('0e01edba-5e55-425e-837a-7efb91c56630','已提交',6370,3,'3919700','2020-04-25 12:09:39','男装;男鞋;');
UPSERT INTO "ORDER_DTL" VALUES('0e01edba-5e55-425e-837a-7efb91c56630','已付款',6370,3,'3919700','2020-04-25 12:09:44','男装;男鞋;');
UPSERT INTO "ORDER_DTL" VALUES('0f46d542-34cb-4ef4-b7fe-6dcfa5f14751','已提交',9380,1,'2993700','2020-04-25 12:09:41','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('0f46d542-34cb-4ef4-b7fe-6dcfa5f14751','已付款',9380,1,'2993700','2020-04-25 12:09:46','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('1fb7c50f-9e26-4aa8-a140-a03d0de78729','已完成',6400,2,'5037058','2020-04-25 12:10:13','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('23275016-996b-420c-8edc-3e3b41de1aee','已付款',280,1,'3018827','2020-04-25 12:09:53','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('2375a7cf-c206-4ac0-8de4-863e7ffae27b','已完成',5600,1,'6489579','2020-04-25 12:08:55','食品;家用电器;');
UPSERT INTO "ORDER_DTL" VALUES('2375a7cf-c206-4ac0-8de4-863e7ffae27b','已付款',5600,1,'6489579','2020-04-25 12:09:00','食品;家用电器;');
UPSERT INTO "ORDER_DTL" VALUES('269fe10c-740b-4fdb-ad25-7939094073de','已提交',8340,2,'2948003','2020-04-25 12:09:26','男装;男鞋;');
UPSERT INTO "ORDER_DTL" VALUES('269fe10c-740b-4fdb-ad25-7939094073de','已付款',8340,2,'2948003','2020-04-25 12:09:30','男装;男鞋;');
UPSERT INTO "ORDER_DTL" VALUES('2849fa34-6513-44d6-8f66-97bccb3a31a1','已提交',7060,2,'2092774','2020-04-25 12:09:38','酒店;旅游;');
UPSERT INTO "ORDER_DTL" VALUES('28b7e793-6d14-455b-91b3-0bd8b23b610c','已提交',640,3,'7152356','2020-04-25 12:09:49','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('28b7e793-6d14-455b-91b3-0bd8b23b610c','已付款',9410,3,'7152356','2020-04-25 12:10:01','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('2909b28a-5085-4f1d-b01e-a34fbaf6ce37','已提交',9390,3,'8237476','2020-04-25 12:10:08','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('2a01dfe5-f5dc-4140-b31b-a6ee27a6e51e','已提交',7490,2,'7813118','2020-04-25 12:09:05','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('2a01dfe5-f5dc-4140-b31b-a6ee27a6e51e','已付款',7490,2,'7813118','2020-04-25 12:09:06','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('2b86ab90-3180-4940-b624-c936a1e7568d','已付款',5360,2,'5301038','2020-04-25 12:08:50','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('2b86ab90-3180-4940-b624-c936a1e7568d','已提交',5360,2,'5301038','2020-04-25 12:08:53','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('2b86ab90-3180-4940-b624-c936a1e7568d','已取消',5360,2,'5301038','2020-04-25 12:08:58','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('2e19fbe8-7970-4d62-8e8f-d364afc2dd41','已付款',6490,0,'3141181','2020-04-25 12:09:22','食品;家用电器;');
UPSERT INTO "ORDER_DTL" VALUES('2fc28d36-dca0-49e8-bad0-42d0602bdb40','已付款',3820,1,'9054826','2020-04-25 12:10:04','家用电器;;电脑;');
UPSERT INTO "ORDER_DTL" VALUES('31477850-8b15-4f1b-9ec3-939f7dc47241','已提交',4650,2,'5837271','2020-04-25 12:08:52','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('31477850-8b15-4f1b-9ec3-939f7dc47241','已付款',4650,2,'5837271','2020-04-25 12:08:57','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('39319322-2d80-41e7-a862-8b8858e63316','已提交',5000,1,'5686435','2020-04-25 12:08:51','家用电器;;电脑;');
UPSERT INTO "ORDER_DTL" VALUES('39319322-2d80-41e7-a862-8b8858e63316','已完成',5000,1,'5686435','2020-04-25 12:08:56','家用电器;;电脑;');
UPSERT INTO "ORDER_DTL" VALUES('3d2254bd-c25a-404f-8e42-2faa4929a629','已提交',5000,3,'1274270','2020-04-25 12:08:41','男装;男鞋;');
UPSERT INTO "ORDER_DTL" VALUES('3d2254bd-c25a-404f-8e42-2faa4929a629','已付款',5000,3,'1274270','2020-04-25 12:08:42','男装;男鞋;');
UPSERT INTO "ORDER_DTL" VALUES('3d2254bd-c25a-404f-8e42-2faa4929a629','已完成',5000,1,'1274270','2020-04-25 12:08:43','男装;男鞋;');
UPSERT INTO "ORDER_DTL" VALUES('42f7fe21-55a3-416f-9535-baa222cc0098','已完成',3600,2,'2661641','2020-04-25 12:09:58','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('44231dbb-9e58-4f1a-8c83-be1aa814be83','已提交',3950,1,'3855371','2020-04-25 12:08:39','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('44231dbb-9e58-4f1a-8c83-be1aa814be83','已付款',3950,1,'3855371','2020-04-25 12:08:40','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('526e33d2-a095-4e19-b759-0017b13666ca','已完成',3280,0,'5553283','2020-04-25 12:09:01','食品;家用电器;');
UPSERT INTO "ORDER_DTL" VALUES('5a6932f4-b4a4-4a1a-b082-2475d13f9240','已提交',50,2,'1764961','2020-04-25 12:10:07','家用电器;;电脑;');
UPSERT INTO "ORDER_DTL" VALUES('5fc0093c-59a3-417b-a9ff-104b9789b530','已提交',6310,2,'1292805','2020-04-25 12:09:36','男装;男鞋;');
UPSERT INTO "ORDER_DTL" VALUES('605c6dd8-123b-4088-a047-e9f377fcd866','已完成',8980,2,'6202324','2020-04-25 12:09:54','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('613cfd50-55c7-44d2-bb67-995f72c488ea','已完成',6830,3,'6977236','2020-04-25 12:10:06','酒店;旅游;');
UPSERT INTO "ORDER_DTL" VALUES('62246ac1-3dcb-4f2c-8943-800c9216c29f','已提交',8610,1,'5264116','2020-04-25 12:09:14','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('62246ac1-3dcb-4f2c-8943-800c9216c29f','已付款',8610,1,'5264116','2020-04-25 12:09:18','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('625c7fef-de87-428a-b581-a63c71059b14','已提交',5970,0,'8051757','2020-04-25 12:09:07','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('625c7fef-de87-428a-b581-a63c71059b14','已付款',5970,0,'8051757','2020-04-25 12:09:19','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('6d43c490-58ab-4e23-b399-dda862e06481','已提交',4570,0,'5514248','2020-04-25 12:09:34','酒店;旅游;');
UPSERT INTO "ORDER_DTL" VALUES('70fa0ae0-6c02-4cfa-91a9-6ad929fe6b1b','已付款',4100,1,'8598963','2020-04-25 12:09:08','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('7170ce71-1fc0-4b6e-a339-67f525536dcd','已完成',9740,1,'4816392','2020-04-25 12:09:51','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('7170ce71-1fc0-4b6e-a339-67f525536dcd','已提交',9740,1,'4816392','2020-04-25 12:10:03','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('71961b06-290b-457d-bbe0-86acb013b0e3','已付款',6550,3,'2393699','2020-04-25 12:08:47','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('71961b06-290b-457d-bbe0-86acb013b0e3','已付款',6550,3,'2393699','2020-04-25 12:08:48','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('71961b06-290b-457d-bbe0-86acb013b0e3','已完成',6550,3,'2393699','2020-04-25 12:08:49','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('72dc148e-ce64-432d-b99f-61c389cb82cd','已提交',4090,1,'2536942','2020-04-25 12:10:12','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('72dc148e-ce64-432d-b99f-61c389cb82cd','已付款',4090,1,'2536942','2020-04-25 12:10:14','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('7c0c1668-b783-413f-afc4-678a5a6d1033','已完成',3850,3,'6803936','2020-04-25 12:09:20','酒店;旅游;');
UPSERT INTO "ORDER_DTL" VALUES('7fa02f7a-10df-4247-9935-94c8b7d4dbc0','已提交',1060,0,'6119810','2020-04-25 12:09:21','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('820c5e83-f2e0-42d4-b5f0-83802c75addc','已付款',9270,2,'5818454','2020-04-25 12:10:09','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('83ed55ec-a439-44e0-8fe0-acb7703fb691','已完成',8380,2,'6804703','2020-04-25 12:09:52','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('85287268-f139-4d59-8087-23fa6454de9d','已提交',9750,1,'4382852','2020-04-25 12:09:43','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('85287268-f139-4d59-8087-23fa6454de9d','已付款',9750,1,'4382852','2020-04-25 12:09:48','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('85287268-f139-4d59-8087-23fa6454de9d','已取消',9750,1,'4382852','2020-04-25 12:10:00','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('8d32669e-327a-4802-89f4-2e91303aee59','已提交',9390,1,'4182962','2020-04-25 12:09:57','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('8dadc2e4-63f1-490f-9182-793be64fed76','已付款',9350,1,'5937549','2020-04-25 12:09:02','酒店;旅游;');
UPSERT INTO "ORDER_DTL" VALUES('94ad8ee0-8898-442c-8cb1-083a4b609616','已提交',4370,0,'4666456','2020-04-25 12:09:13','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('994cbb44-f0ee-45ff-a4f4-76c87bc2b972','已付款',3190,3,'3200759','2020-04-25 12:09:25','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('9bf92519-6eb3-449a-853b-0e19f6005887','已提交',1100,0,'3457528','2020-04-25 12:10:11','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('9ff3032c-8679-4247-9e6f-4caf2dc93aff','已提交',850,0,'8835231','2020-04-25 12:09:40','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('9ff3032c-8679-4247-9e6f-4caf2dc93aff','已付款',850,0,'8835231','2020-04-25 12:09:45','食品;家用电器;');
UPSERT INTO "ORDER_DTL" VALUES('a467ba42-f91e-48a0-865e-1703aaa45e0e','已提交',8040,0,'8206022','2020-04-25 12:09:50','家用电器;;电脑;');
UPSERT INTO "ORDER_DTL" VALUES('a467ba42-f91e-48a0-865e-1703aaa45e0e','已付款',8040,0,'8206022','2020-04-25 12:10:02','家用电器;;电脑;');
UPSERT INTO "ORDER_DTL" VALUES('a5302f47-96d9-41b4-a14c-c7a508f59282','已付款',8570,2,'5319315','2020-04-25 12:08:44','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('a5b57bec-6235-45f4-bd7e-6deb5cd1e008','已提交',5700,3,'6486444','2020-04-25 12:09:27','酒店;旅游;');
UPSERT INTO "ORDER_DTL" VALUES('a5b57bec-6235-45f4-bd7e-6deb5cd1e008','已付款',5700,3,'6486444','2020-04-25 12:09:31','酒店;旅游;');
UPSERT INTO "ORDER_DTL" VALUES('ae5c3363-cf8f-48a9-9676-701a7b0a7ca5','已付款',7460,1,'2379296','2020-04-25 12:09:23','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('b1fb2399-7cf2-4af5-960a-a4d77f4803b8','已提交',2690,3,'6686018','2020-04-25 12:09:55','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('b21c7dbd-dabd-4610-94b9-d7039866a8eb','已提交',6310,2,'1552851','2020-04-25 12:09:15','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('b4bfd4b7-51f5-480e-9e23-8b1579e36248','已提交',4000,1,'3260372','2020-04-25 12:09:35','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('b63983cc-2b59-4992-84c6-9810526d0282','已提交',7370,3,'3107867','2020-04-25 12:08:45','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('b63983cc-2b59-4992-84c6-9810526d0282','已付款',7370,3,'3107867','2020-04-25 12:08:46','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('bf60b752-1ccc-43bf-9bc3-b2aeccacc0ed','已提交',720,2,'5034117','2020-04-25 12:09:03','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('c808addc-8b8b-4d89-99b1-db2ed52e61b4','已提交',3630,1,'6435854','2020-04-25 12:09:10','酒店;旅游;');
UPSERT INTO "ORDER_DTL" VALUES('cc9dbd20-cf9f-4097-ae8b-4e73db1e4ba1','已付款',5000,0,'2007322','2020-04-25 12:08:38','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('ccceaf57-a5ab-44df-834a-e7b32c63efc1','已提交',2660,2,'7928516','2020-04-25 12:09:42','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('ccceaf57-a5ab-44df-834a-e7b32c63efc1','已付款',2660,2,'7928516','2020-04-25 12:09:47','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('ccceaf57-a5ab-44df-834a-e7b32c63efc1','已完成',2660,2,'7928516','2020-04-25 12:09:59','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('d7be5c39-e07c-40e8-bf09-4922fbc6335c','已付款',8750,2,'1250995','2020-04-25 12:09:09','食品;家用电器;');
UPSERT INTO "ORDER_DTL" VALUES('dfe16df7-4a46-4b6f-9c6d-083ec215218e','已完成',410,0,'1923817','2020-04-25 12:09:56','家用电器;;电脑;');
UPSERT INTO "ORDER_DTL" VALUES('e1241ad4-c9c1-4c17-93b9-ef2c26e7f2b2','已付款',6760,0,'2457464','2020-04-25 12:08:54','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('e1241ad4-c9c1-4c17-93b9-ef2c26e7f2b2','已提交',6760,0,'2457464','2020-04-25 12:08:59','数码;女装;');
UPSERT INTO "ORDER_DTL" VALUES('e180a9f2-9f80-4b6d-99c8-452d6c037fc7','已付款',8120,2,'7645270','2020-04-25 12:09:28','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('e180a9f2-9f80-4b6d-99c8-452d6c037fc7','已完成',8120,2,'7645270','2020-04-25 12:09:32','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('e4418843-9ac0-47a7-bfd8-d61c4d296933','已付款',8170,2,'7695668','2020-04-25 12:09:11','家用电器;;电脑;');
UPSERT INTO "ORDER_DTL" VALUES('e8b3bb37-1019-4492-93c7-305177271a71','已完成',2560,2,'4405460','2020-04-25 12:10:05','男装;男鞋;');
UPSERT INTO "ORDER_DTL" VALUES('eb1a1a22-953a-42f1-b594-f5dfc8fb6262','已完成',2370,2,'8233485','2020-04-25 12:09:24','机票;文娱;');
UPSERT INTO "ORDER_DTL" VALUES('ecfd18f5-45f2-4dcd-9c47-f2ad9b216bd0','已付款',8070,3,'6387107','2020-04-25 12:09:04','酒店;旅游;');
UPSERT INTO "ORDER_DTL" VALUES('ecfd18f5-45f2-4dcd-9c47-f2ad9b216bd0','已完成',8070,3,'6387107','2020-04-25 12:09:17','酒店;旅游;');
UPSERT INTO "ORDER_DTL" VALUES('f1226752-7be3-4702-a496-3ddba56f66ec','已付款',4410,3,'1981968','2020-04-25 12:10:10','维修;手机;');
UPSERT INTO "ORDER_DTL" VALUES('f642b16b-eade-4169-9eeb-4d5f294ec594','已提交',4010,1,'6463215','2020-04-25 12:09:29','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('f642b16b-eade-4169-9eeb-4d5f294ec594','已付款',4010,1,'6463215','2020-04-25 12:09:33','男鞋;汽车;');
UPSERT INTO "ORDER_DTL" VALUES('f8f3ca6f-2f5c-44fd-9755-1792de183845','已付款',5950,3,'4060214','2020-04-25 12:09:12','机票;文娱;');
我们发现数据分布在每一个Region中。
1.1.2 加盐指定数量分区
-- Recreate ORDER_DTL with GZ compression and 10 salt buckets.
-- Table options are separated by commas.
drop table if exists ORDER_DTL;
create table if not exists ORDER_DTL(
"id" varchar primary key,
C1."status" varchar,
C1."money" float,
C1."pay_way" integer,
C1."user_id" varchar,
C1."operation_time" varchar,
C1."category" varchar
)
COMPRESSION='GZ', SALT_BUCKETS=10;
我们在HBase的Web UI中可以查看到生成了10个Region
插入数据后,发现数据分布在每一个Region中。
查看HBase中的表,我们发现Phoenix在每个ID前,都添加了一个Hash值,用来将数据分布到不同的Region中。
hbase(main):018:0> scan "ORDER_DTL", {LIMIT => 1}
ROW COLUMN+CELL
\x000f46d542-34cb-4ef4-b7fe-6dcfa5f14751 column=C1:\x00\x00\x00\x00, timestamp=1589268724801, value=x
\x000f46d542-34cb-4ef4-b7fe-6dcfa5f14751 column=C1:\x80\x0B, timestamp=1589268724801, value=\xE5\xB7\xB2\xE4\xBB\x98\xE6\xAC\xBE
\x000f46d542-34cb-4ef4-b7fe-6dcfa5f14751 column=C1:\x80\x0C, timestamp=1589268724801, value=\xC6\x12\x90\x01
\x000f46d542-34cb-4ef4-b7fe-6dcfa5f14751 column=C1:\x80\x0D, timestamp=1589268724801, value=\x80\x00\x00\x01
\x000f46d542-34cb-4ef4-b7fe-6dcfa5f14751 column=C1:\x80\x0E, timestamp=1589268724801, value=2993700
\x000f46d542-34cb-4ef4-b7fe-6dcfa5f14751 column=C1:\x80\x0F, timestamp=1589268724801, value=2020-04-25 12:09:46
\x000f46d542-34cb-4ef4-b7fe-6dcfa5f14751 column=C1:\x80\x10, timestamp=1589268724801, value=\xE7\xBB\xB4\xE4\xBF\xAE;\xE6\x89\x8B\xE6\x9C\xBA;
1 row(s)
注意:COMPRESSION和SALT_BUCKETS之间需要使用逗号分隔,否则会出现语法错误
2.1 hbase热点问题解决(预分区)
一、出现热点问题原因
1、hbase中的数据是按照字典序排序的,当大量连续的rowkey集中写在个别的region时,各个region之间数据分布不均衡;
2、创建表时没有提前预分区,创建的表默认只有一个region,大量的数据写入当前region;
3、创建表已经提前预分区,但是设计的rowkey没有规律可循,设计的rowkey应该由regionNo+messageId组成。
二、如何解决热点问题
解决这个问题,关键是要设计出可以让数据分布均匀的rowkey,与关系型数据库一样,rowkey是用来检索记录的主键。访问hbase table中的行,rowkey 可以是任意字符串(最大长度 是 64KB,实际应用中长度一般为 10-100bytes),在hbase内部,rowkey保存为字节数组,存储时,数据按照rowkey的字典序排序存储。
创建表命令:
# Pre-split into 10 regions: nine split points '0001|' .. '0009|' yield ten regions.
# Row keys prefixed 0001..0009 sort below their 'NNNN|' boundary ('|' sorts after
# digits and letters); row keys prefixed 0010 land in the last region.
create 'testTable333', {NAME => 'cf', DATA_BLOCK_ENCODING => 'NONE', BLOOMFILTER => 'ROW', REPLICATION_SCOPE => '0', VERSIONS => '1', COMPRESSION => 'snappy', MIN_VERSIONS => '0', TTL => '15552000', KEEP_DELETED_CELLS => 'false', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true', METADATA => {'ENCODE_ON_DISK' => 'true'}}, {SPLITS => ['0001|','0002|','0003|','0004|','0005|','0006|','0007|','0008|','0009|']}
我这里预分10个region,执行命令之后,在hbase的console中可以看到以下信息,说明预分区ok了!!!
1、第一种设计rowkey方式:随机数+messageId,如果想让最近的数据快速get到,可以将时间戳加上,我这里的region是0001|到0009|开头的,因为hbase的数据是字典序排序的,所以如果我生成的 rowkey=0002rer4343343422,则当前这条数据就会保存到0001|~0002|这个region里,因为我的messageId都是字母+数字,“|”的ASCII值大于字母、数字。
生成regionNo的工具类:RegionUtils
package com.cn.dl;
import java.util.Random;
/**
 * Generates the zero-padded region-number prefixes ("0001" .. "0010") that are
 * prepended to row keys so writes spread across the pre-split regions.
 *
 * Created by Tiger on 2018/4/18.
 */
public class RegionUtils {
    // Number of region prefixes to generate.
    private static final int REGION_NUM = 10;
    // Width every region number is left-padded to with '0'.
    private static final int REGION_NO_WIDTH = 4;
    // Holds the region numbers: 0001, 0002, ... 0009, 0010.
    private static final String[] REGION_ARRAY = new String[REGION_NUM];
    // Shared generator, so getRegionNo() does not allocate a Random on every call.
    private static final Random RANDOM = new Random();

    static {
        initRegionArray();
    }

    /**
     * Fills REGION_ARRAY with the region numbers 1..REGION_NUM, each left-padded
     * with zeros to REGION_NO_WIDTH characters.
     */
    private static void initRegionArray() {
        for (int i = 1; i <= REGION_NUM; i++) {
            StringBuilder regionNo = new StringBuilder(String.valueOf(i));
            while (regionNo.length() < REGION_NO_WIDTH) {
                regionNo.insert(0, '0');
            }
            REGION_ARRAY[i - 1] = regionNo.toString();
        }
    }

    /**
     * Picks one of the generated region numbers at random.
     *
     * @return a region number from REGION_ARRAY
     */
    public static String getRegionNo() {
        // Bound by the array length so the pick always matches REGION_NUM.
        return REGION_ARRAY[RANDOM.nextInt(REGION_ARRAY.length)];
    }

    /** Prints 100 randomly chosen region numbers as a quick manual check. */
    public static void main(String[] args) {
        for (int i = 0; i < 100; i++) {
            System.out.println(getRegionNo());
        }
    }
}
—— 生成rowKey:
// TODO: 2018/12/18 this is only an example of how a rowKey is generated
/**
 * Parses the "messageSpout" field of the tuple as JSON, attaches a generated
 * rowKey to it, prints the resulting JSON and finally acks the tuple.
 * Any exception is printed and does not prevent the ack.
 */
public void execute(Tuple tuple) {
    try {
        String payload = tuple.getStringByField("messageSpout");
        JSONObject message = JSONObject.parseObject(payload);
        String messageId = message.getString("messageId");
        // TODO: 2018/12/18 rowKey layout: regionNo + timestamp + messageId
        // (the original author notes the timestamp is meant to help HBase lookups)
        String regionNo = RegionUtils.getRegionNo();
        long now = System.currentTimeMillis();
        String rowKey = regionNo + now + messageId;
        message.put("rowKey", rowKey);
        System.out.println(message.toJSONString());
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Ack regardless of success so the tuple is always acknowledged.
        collector.ack(tuple);
    }
}
打印结果,rowkey=regionNo+时间戳+messageId,前缀是随机的
{"name":"name2","messageId":"b998a8dfc05a4a819284213d4e727a85","age":12,"rowKey":"00041545105001972b998a8dfc05a4a819284213d4e727a85"}
{"name":"name3","messageId":"799affbf346641e8a00bfeee78ffcdb4","age":13,"rowKey":"00031545105002973799affbf346641e8a00bfeee78ffcdb4"}
{"name":"name4","messageId":"bbdf16b9a12b4fa09b060402f9522fed","age":14,"rowKey":"00051545105003973bbdf16b9a12b4fa09b060402f9522fed"}
{"name":"name5","messageId":"03c119868cd742459464df53c3827147","age":15,"rowKey":"0009154510500497403c119868cd742459464df53c3827147"}
{"name":"name6","messageId":"84c682681cdc4ac09ad3d270741074d3","age":16,"rowKey":"0002154510500597484c682681cdc4ac09ad3d270741074d3"}
{"name":"name7","messageId":"aecbd65f3f434452ab4a924d8e42b947","age":17,"rowKey":"00091545105006976aecbd65f3f434452ab4a924d8e42b947"}
{"name":"name8","messageId":"3bcb23e414e5450898b6b0eefff4d80a","age":18,"rowKey":"000315451050079783bcb23e414e5450898b6b0eefff4d80a"}
{"name":"name9","messageId":"40be62bfcea24ea799ae6f191241c5e8","age":19,"rowKey":"0002154510500897840be62bfcea24ea799ae6f191241c5e8"}
{"name":"name10","messageId":"94c220cd10d141c89cf08e61d0a48e7f","age":20,"rowKey":"0007154510500997894c220cd10d141c89cf08e61d0a48e7f"}
{"name":"name11","messageId":"0796f735b1ba43beb7b15d63c4fd4ec8","age":21,"rowKey":"000515451050109780796f735b1ba43beb7b15d63c4fd4ec8"}
{"name":"name12","messageId":"05ac8417e52443f48bf2c56879b3e2c6","age":22,"rowKey":"0009154510501197805ac8417e52443f48bf2c56879b3e2c6"}
{"name":"name13","messageId":"b87484b633b747ba8320cdf69334459b","age":23,"rowKey":"00101545105012978b87484b633b747ba8320cdf69334459b"}
{"name":"name14","messageId":"84c6daf1cdfd4c0f977a8742ee528977","age":24,"rowKey":"0005154510501397984c6daf1cdfd4c0f977a8742ee528977"}
{"name":"name15","messageId":"8e01e5c53d024de18507ed2a98e38519","age":25,"rowKey":"000415451050149808e01e5c53d024de18507ed2a98e38519"}
{"name":"name16","messageId":"48939394581946e881b91e797659d6ca","age":26,"rowKey":"0005154510501598248939394581946e881b91e797659d6ca"}
{"name":"name17","messageId":"b5c2024c721642a5a2b4cc8682c7cd40","age":27,"rowKey":"00021545105016981b5c2024c721642a5a2b4cc8682c7cd40"}
{"name":"name18","messageId":"4efb10fc351947f6a78761e7b3bf1783","age":28,"rowKey":"000215451050179824efb10fc351947f6a78761e7b3bf1783"}
{"name":"name19","messageId":"a4a27bafd5f749d0b08d9eb832120737","age":29,"rowKey":"00041545105018983a4a27bafd5f749d0b08d9eb832120737"}
{"name":"name20","messageId":"1d90758cbcc2495197b2ef0a05e58610","age":30,"rowKey":"000815451050199831d90758cbcc2495197b2ef0a05e58610"}
{"name":"name21","messageId":"47ae3fb0e3914496b75445df92d0a133","age":31,"rowKey":"0002154510502098347ae3fb0e3914496b75445df92d0a133"}
{"name":"name22","messageId":"419dccbeb2b74484997bd5373f8347af","age":32,"rowKey":"00071545105021982419dccbeb2b74484997bd5373f8347af"}
{"name":"name23","messageId":"51e38da4c9c74542bcd38be704ff3fec","age":33,"rowKey":"0005154510502298351e38da4c9c74542bcd38be704ff3fec"}
{"name":"name24","messageId":"e99e783220d14f63be1f967f82bd69fb","age":34,"rowKey":"00081545105023983e99e783220d14f63be1f967f82bd69fb"}
{"name":"name25","messageId":"0e15f181464146598dfe44976676c706","age":35,"rowKey":"000915451050249840e15f181464146598dfe44976676c706"}
{"name":"name26","messageId":"d0d8c0e939b44a688915714b85d61b1f","age":36,"rowKey":"00101545105025985d0d8c0e939b44a688915714b85d61b1f"}
{"name":"name27","messageId":"4a7d3404871a4137b9db4e24209f1346","age":37,"rowKey":"000515451050269844a7d3404871a4137b9db4e24209f1346"}
{"name":"name28","messageId":"51a5f39f032241fcaa6e5956a9ddb474","age":38,"rowKey":"0002154510502798651a5f39f032241fcaa6e5956a9ddb474"}
{"name":"name29","messageId":"c54ec6f46fa047fdae6187afad55e8f0","age":39,"rowKey":"00011545105028986c54ec6f46fa047fdae6187afad55e8f0"}
{"name":"name30","messageId":"076322780dae4748997006d53aa246c1","age":40,"rowKey":"00021545105029987076322780dae4748997006d53aa246c1"}
{"name":"name31","messageId":"563c258840a84f12ad1a5341381b163c","age":41,"rowKey":"00101545105030986563c258840a84f12ad1a5341381b163c"}
{"name":"name32","messageId":"377f3fbe63634fc4aa93b1737dd462da","age":42,"rowKey":"00051545105031988377f3fbe63634fc4aa93b1737dd462da"}
{"name":"name33","messageId":"5c1af0f26cc94d0b85d0fbca617e57ad","age":43,"rowKey":"000715451050329895c1af0f26cc94d0b85d0fbca617e57ad"}
{"name":"name34","messageId":"2d3050a0b3dd42df97ef87d45dbc0d26","age":44,"rowKey":"000715451050339902d3050a0b3dd42df97ef87d45dbc0d26"}
{"name":"name35","messageId":"526401e1f5ab45aeb95421e0ac200e4b","age":45,"rowKey":"00041545105034990526401e1f5ab45aeb95421e0ac200e4b"}
总结:
这种设计的rowkey可以解决热点问题,但是要建立关联表,比如将rowkey保存到数据库或者nosql数据库中,因为前面的regionNo是随机的,不知道 对应数据在hbase的rowkey是多少;同一批数据,因为这个regionNo是随机的,所以要到多个region中get数据,不能使用startkey和endkey去get数据。
2、第二种设计rowkey的方式:通过messageId映射regionNo,这样既可以让数据均匀分布到各个region中,又可以根据startkey和endkey get到同一批数据。messageId到regionNo的映射使用一致性hash算法实现。一致性哈希算法于1997年由麻省理工学院的Karger等人在解决分布式Cache问题时提出,设计目标是解决因特网中的热点(Hot spot)问题。
/**
 * Hash ring that maps string keys onto node names. Every node is placed on the
 * ring numberOfReplicas times (virtual nodes); a key belongs to the first
 * virtual node at or after the key's hash, wrapping around to the start.
 */
public class ConsistentHash<T> implements Serializable {
    private static final long serialVersionUID = 1L;
    // Hash function applied to both virtual-node labels and lookup keys.
    private final HashFunction hashFunction;
    // Number of virtual nodes placed on the ring for each real node.
    private final int numberOfReplicas;
    // Ring: hash of a virtual node -> name of the real node it stands for.
    private final SortedMap<Long, String> circle = new TreeMap<Long, String>();

    /**
     * Builds the ring and registers every node of the given collection.
     *
     * @param hashFunction     hash used for virtual nodes and keys
     * @param numberOfReplicas virtual nodes per real node
     * @param nodes            real node names to add
     */
    public ConsistentHash(HashFunction hashFunction, int numberOfReplicas, Collection<String> nodes) {
        this.hashFunction = hashFunction;
        this.numberOfReplicas = numberOfReplicas;
        for (String realNode : nodes) {
            add(realNode);
        }
    }

    /**
     * Adds a node: one ring entry per replica, each hashed from the node name
     * followed by the replica index, all pointing at the same real node.
     *
     * @param node real node name
     */
    public void add(String node) {
        int replica = 0;
        while (replica < numberOfReplicas) {
            long position = hashFunction.getHashValue(node.toString() + replica);
            circle.put(position, node);
            replica++;
        }
    }

    /**
     * Removes a node by deleting the ring entry of each of its replicas.
     *
     * @param node real node name
     */
    public void remove(String node) {
        int replica = 0;
        while (replica < numberOfReplicas) {
            long position = hashFunction.getHashValue(node.toString() + replica);
            circle.remove(position);
            replica++;
        }
    }

    /**
     * Hashes the key and returns the real node that owns it.
     *
     * @param key lookup key; it is cast to String before hashing
     * @return the owning node name, or null when the ring is empty
     */
    public String get(Object key) {
        if (circle.isEmpty()) {
            return null;
        }
        long keyHash = hashFunction.getHashValue((String) key);
        // tailMap is inclusive, so an exact hit is its own owner; otherwise take
        // the next entry clockwise, wrapping to the first entry of the ring.
        SortedMap<Long, String> clockwise = circle.tailMap(keyHash);
        Long owner = clockwise.isEmpty() ? circle.firstKey() : clockwise.firstKey();
        return circle.get(owner);
    }

    /**
     * @return the number of entries (virtual nodes) currently on the ring
     */
    public long getSize() {
        long entries = circle.size();
        return entries;
    }

    /**
     * Formats a double using the pattern "0.0000" (four decimal places).
     *
     * @param num value to format
     * @return the formatted text
     */
    public String getDecimalPoint(double num) {
        return new DecimalFormat("0.0000").format(num);
    }
}
/**
 * String hash used to place virtual nodes and keys on the consistent-hash ring.
 */
public class HashFunction implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * Computes a non-negative hash of the given string: a per-character
     * XOR/multiply pass followed by a fixed sequence of shift/add/XOR mixing steps.
     *
     * NOTE(review): the constants resemble, but differ from, the standard 32-bit
     * FNV parameters (offset basis 2166136261, prime 16777619). They are kept
     * as-is so existing hash values do not change; confirm whether intentional.
     *
     * @param key text to hash; must not be null
     * @return the hash as a non-negative long
     */
    public long getHashValue(String key) {
        final int p = 1677761999;
        int hash = (int) 216613626111L;
        for (int i = 0; i < key.length(); i++) {
            hash = (hash ^ key.charAt(i)) * p;
        }
        hash += hash << 13;
        hash ^= hash >> 8;
        hash += hash << 3;
        hash ^= hash >> 18;
        hash += hash << 5;
        // Widen before taking the absolute value: Math.abs(int) returns
        // Integer.MIN_VALUE unchanged, which would leak a negative result.
        return Math.abs((long) hash);
    }
}
————————————————
我目前满意第二种方式,然后在es中建立关联表,get数据时,先在es中get到rowkey,然后在hbase中获取数据,这个根据自己的业务设计。
写的内容有问题,欢迎来吐槽,我会及时修改,谢谢!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。