今天,我在做kudu的分区测试,那个结果让我很困惑
首先,我用presto创建两个kudu表
表1
-- Table 1: hash-partitioned only — 36 hash buckets on the primary key `id`.
-- Single replica is for benchmarking on one node only; never use in production.
CREATE TABLE events_one (
    id INTEGER WITH (primary_key = true),
    event_time TIMESTAMP,
    score DECIMAL(8, 2),  -- exact decimal: correct type for aggregated scores
    message VARCHAR
) WITH (
    partition_by_hash_columns = ARRAY['id'],
    partition_by_hash_buckets = 36,
    number_of_replicas = 1
);
表2
-- Table 2: hash (36 buckets on id) + range partitioning on event_time.
-- NOTE: both hash columns and range columns must be part of the primary key,
-- hence event_time is also declared primary_key = true here.
-- Each range is half-open: [lower, upper) — contiguous quarters from 2015 to 2018.
CREATE TABLE events_two (
    id INTEGER WITH (primary_key = true),
    event_time TIMESTAMP WITH (primary_key = true),
    score DECIMAL(8, 2),  -- exact decimal: correct type for aggregated scores
    message VARCHAR
) WITH (
    partition_by_hash_columns = ARRAY['id'],
    partition_by_hash_buckets = 36,
    partition_by_range_columns = ARRAY['event_time'],
    range_partitions = '[
        {"lower": "2015-01-01T00:00:00", "upper": "2015-03-01T00:00:00"},
        {"lower": "2015-03-01T00:00:00", "upper": "2015-06-01T00:00:00"},
        {"lower": "2015-06-01T00:00:00", "upper": "2015-09-01T00:00:00"},
        {"lower": "2015-09-01T00:00:00", "upper": "2016-01-01T00:00:00"},
        {"lower": "2016-01-01T00:00:00", "upper": "2016-03-01T00:00:00"},
        {"lower": "2016-03-01T00:00:00", "upper": "2016-06-01T00:00:00"},
        {"lower": "2016-06-01T00:00:00", "upper": "2016-09-01T00:00:00"},
        {"lower": "2016-09-01T00:00:00", "upper": "2017-01-01T00:00:00"},
        {"lower": "2017-01-01T00:00:00", "upper": "2017-03-01T00:00:00"},
        {"lower": "2017-03-01T00:00:00", "upper": "2017-06-01T00:00:00"},
        {"lower": "2017-06-01T00:00:00", "upper": "2017-09-01T00:00:00"},
        {"lower": "2017-09-01T00:00:00", "upper": "2018-01-01T00:00:00"}
    ]',
    number_of_replicas = 1  -- single replica: benchmark setup only
);
在每个表中插入10000000行数据
// Bulk-load 10,000,000 rows with random event_time values in [2015-01-01, 2018-01-01),
// flushing to Kudu in batches of 3000.
List<Date> eventsTime = getDateTimes("2015-01-01","2018-01-01");
Random random = new Random();  // hoisted: one RNG for the whole load, not one per row
List<TestTableTwo> entitys = new ArrayList<>();
// '<=' so exactly 10,000,000 rows are produced (original '<' built only 9,999,999)
for(int i=1;i<=10000000;i++){
    TestTableTwo entity = new TestTableTwo();
    entity.setId(i);
    entity.setEvent_time(eventsTime.get(random.nextInt(eventsTime.size())));
    entity.setScore(score);
    entity.setMessage(msg);
    entitys.add(entity);
    if(i%3000==0){
        KuduUtil.save(entitys);
        entitys = new ArrayList<>();
        System.out.println(i);
    }
}
// Flush the final partial batch — the original dropped any rows left over after
// the last multiple of 3000, silently under-inserting.
if(!entitys.isEmpty()){
    KuduUtil.save(entitys);
}
最后,我测试了查询速度
/**
 * Benchmark: execute the same query {@code per} times over Presto JDBC and
 * print the average wall-clock latency per execution.
 *
 * @param args unused
 * @throws Exception if the driver cannot load or any JDBC call fails
 */
public static void main(String[] args) throws Exception{
    Class.forName("com.facebook.presto.jdbc.PrestoDriver");
    String sql1 = "select count(1) from events_two where event_time < timestamp '2017-01-03 00:00:00'";
    int per = 100;
    long totalTime = 0L;  // primitive long: avoids Long boxing in the timing loop
    // try-with-resources: Connection and Statement are always closed, even on
    // failure — the original leaked both.
    try (Connection conn = DriverManager.getConnection(
             "jdbc:presto://10.100.218.110:8090/kudu/tcdb", "root", null);
         Statement sm = conn.createStatement()) {
        for (int i = 0; i < per; i++) {
            long startTime = System.currentTimeMillis();
            // Close each ResultSet per iteration; the original leaked 100 of them.
            try (ResultSet sets = sm.executeQuery(sql1)) {
                while (sets.next()) {
                    System.out.println(sets.getString(1));
                }
            }
            totalTime += System.currentTimeMillis() - startTime;
        }
    }
    System.out.println("testSql:" + sql1);
    System.out.println("per cost :" + totalTime / per + "ms");
}
结果是:
---
testSql:select * from events_one where id = 200
per cost :61ms
testSql:select * from events_two where id = 200
per cost :54ms
---
testSql:select sum(score) from events_one where event_time < timestamp '2017-01-03 00:00:00'
per cost :448ms
testSql:select sum(score) from events_two where event_time < timestamp '2017-01-03 00:00:00'
per cost :736ms
---
testSql:select sum(score) from events_one where id<5000000 and event_time < timestamp '2017-01-03 00:00:00'
per cost :281ms
testSql:select sum(score) from events_two where id<5000000 and event_time < timestamp '2017-01-03 00:00:00'
per cost :745ms
所有数据的 event_time 在“2015-01-01”至“2018-01-01”之间随机分布
为什么kudu表按hash和range分区比按hash分区慢?
1条答案
按热度按时间lnvxswe21#
在Kudu官方文档中,我发现这样一句话: