Skip to content

Commit

Permalink
skip unsupported data types
Browse files Browse the repository at this point in the history
Signed-off-by: Murphy <[email protected]>
  • Loading branch information
murphyatwork committed Nov 8, 2024
1 parent c3aef83 commit 46bfe29
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 117 deletions.
28 changes: 23 additions & 5 deletions be/src/storage/rowset/data_sample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "storage/olap_common.h"
#include "storage/types.h"
#include "storage/zone_map_detail.h"
#include "types/logical_type.h"
#include "util/runtime_profile.h"

namespace starrocks {
Expand Down Expand Up @@ -86,6 +87,28 @@ double SortableZoneMap::width(const ZoneMapDetail& zone) {
return width(zone.min_value(), zone.max_value());
}

// Reports whether zone maps over columns of the given logical type are
// accepted by SortableZoneMap.
//
// Returns true for the signed and unsigned integer types, the floating-point
// types, and DATE/DATETIME; every other logical type yields false.
bool SortableZoneMap::is_support_data_type(LogicalType type) {
    // TODO: support decimal type
    const bool is_signed_integer = type == TYPE_TINYINT || type == TYPE_SMALLINT || type == TYPE_INT ||
                                   type == TYPE_BIGINT || type == TYPE_LARGEINT;
    const bool is_unsigned_integer = type == TYPE_UNSIGNED_TINYINT || type == TYPE_UNSIGNED_SMALLINT ||
                                     type == TYPE_UNSIGNED_INT || type == TYPE_UNSIGNED_BIGINT;
    const bool is_floating_point = type == TYPE_FLOAT || type == TYPE_DOUBLE;
    const bool is_temporal = type == TYPE_DATE || type == TYPE_DATETIME;

    return is_signed_integer || is_unsigned_integer || is_floating_point || is_temporal;
}

double SortableZoneMap::width(const Datum& lhs, const Datum& rhs) {
if (lhs.is_null() || rhs.is_null()) {
return 0;
Expand Down Expand Up @@ -113,11 +136,6 @@ double SortableZoneMap::width(const Datum& lhs, const Datum& rhs) {
return rhs.get_float() - lhs.get_float();
case TYPE_DOUBLE:
return rhs.get_double() - lhs.get_double();
case TYPE_DECIMAL:
case TYPE_DECIMAL32:
case TYPE_DECIMAL64:
case TYPE_DECIMAL128:
return rhs.get_decimal() - lhs.get_decimal();
case TYPE_DATE:
return rhs.get_date().julian() - lhs.get_date().julian();
case TYPE_DATETIME:
Expand Down
3 changes: 3 additions & 0 deletions be/src/storage/rowset/data_sample.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ struct SortableZoneMap {
std::vector<size_t> page_indices; // Keep zonemap immutable, sort by this page_indices
std::vector<PagesWithZoneMap> histogram; // Only exist if the zonemap is diverse enough

static bool is_support_data_type(LogicalType type);

void sort();
bool is_diverse();
void build_histogram(size_t buckets);
Expand All @@ -107,6 +109,7 @@ class PageDataSample final : public DataSample {
void with_zonemap(std::shared_ptr<SortableZoneMap> zonemap) { _zonemap = std::move(zonemap); }

private:
bool _is_histogram_supported_type(LogicalType type) const;
void _prepare_histogram(OlapReaderStatistics* stats);
bool _has_histogram() const;
StatusOr<RowIdSparseRange> _bernoulli_sample(OlapReaderStatistics* stats);
Expand Down
2 changes: 1 addition & 1 deletion be/src/storage/rowset/segment_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2089,7 +2089,7 @@ StatusOr<RowIdSparseRange> SegmentIterator::_sample_by_page() {
int64_t random_seed = _opts.sample_options.random_seed;
auto sampler = DataSample::make_page_sample(probability_percent, random_seed, num_data_pages, page_indexer);

if (column_reader->has_zone_map()) {
if (column_reader->has_zone_map() && SortableZoneMap::is_support_data_type(column_reader->column_type())) {
IndexReadOptions opts = _index_read_options(cid);
ASSIGN_OR_RETURN(auto zonemap, column_reader->get_raw_zone_map(opts));
auto sorted = std::make_shared<SortableZoneMap>(column_reader->column_type(), std::move(zonemap));
Expand Down
108 changes: 0 additions & 108 deletions be/src/storage/rowset/tmp.sql

This file was deleted.

11 changes: 8 additions & 3 deletions tools/benchmark/table_sample.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
mysql_user=${MYSQL_USER:-"root"}
mysql_host=${MYSQL_HOST:-"127.0.0.1"}
mysql_port=${MYSQL_PORT:-"9030"}
mysql_database=ssb_100g
mysql_database=tpcds_100g

query_columns="select table_name, column_name from information_schema.columns where table_schema='ssb_100g' and table_name='lineorder'"
query_columns="select table_name, column_name from information_schema.columns where table_schema='tpcds_100g' \
and table_name in ('catalog_sales', 'store_returns', 'store_sales', 'web_sales') "


percents=(
Expand All @@ -33,6 +34,10 @@ execute_sql() {
local query="$2"

result=$(mysql -u ${mysql_user} -P ${mysql_port} -h ${mysql_host} ${mysql_database} -vvv -e "$query" 2>&1)
if [ $? -ne 0 ]; then
echo "MySQL execution failed: ${result}"
exit 1;
fi

execution_time=$(echo "$result" | grep -oP '\(\K[0-9]+\.[0-9]+(?= sec\))')
count=$(echo "$result" | grep -A2 "count" | tail -n1 | awk '{print $2}')
Expand All @@ -44,7 +49,7 @@ execute_sql() {


while read table col; do
full_column="table=$table.$col"
full_column="column=$table.$col"
method="method=full"
percent="percent=100"

Expand Down

0 comments on commit 46bfe29

Please sign in to comment.