|
1 | 1 | import collections
|
2 | 2 | import random
|
| 3 | +import sys |
| 4 | +import re |
3 | 5 |
|
4 | 6 | from raco.algebra import *
|
5 | 7 | from raco.expression import NamedAttributeRef as AttRef
|
6 | 8 | from raco.expression import UnnamedAttributeRef as AttIndex
|
7 | 9 | from raco.expression import StateVar
|
| 10 | +from raco.expression import aggregate |
8 | 11 |
|
9 | 12 | from raco.backends.myria import (
|
10 | 13 | MyriaShuffleConsumer, MyriaShuffleProducer, MyriaHyperShuffleProducer,
|
11 |
| - MyriaBroadcastConsumer, MyriaQueryScan, MyriaSplitConsumer) |
| 14 | + MyriaBroadcastConsumer, MyriaQueryScan, MyriaSplitConsumer, MyriaDupElim, |
| 15 | + MyriaGroupBy) |
12 | 16 | from raco.backends.myria import (MyriaLeftDeepTreeAlgebra,
|
13 | 17 | MyriaHyperCubeAlgebra)
|
14 | 18 | from raco.compile import optimize
|
@@ -1003,3 +1007,170 @@ def test_projecting_join_maintains_partitioning(self):
|
1003 | 1007 | # (in general, info could be h($0) && h($2)
|
1004 | 1008 | self.assertEquals(pp.partitioning().hash_partitioned,
|
1005 | 1009 | frozenset([AttIndex(0)]))
|
| 1010 | + |
def test_no_shuffle_for_partitioned_distinct(self):
    """Do not shuffle for Distinct if already partitioned.

    Scans the relation named by ``self.part_key``, selects the distinct
    values of column ``h``, and checks both the operators present in the
    physical plan and the contents of the stored OUTPUT table.
    """

    query = """
    r = scan({part});
    t = select distinct r.h from r;
    store(t, OUTPUT);""".format(part=self.part_key)

    lp = self.get_logical_plan(query)
    pp = self.logical_to_physical(lp)

    # shuffles should be removed and distinct not decomposed into two
    self.assertEqual(self.get_count(pp, MyriaShuffleConsumer), 0)
    self.assertEqual(self.get_count(pp, MyriaShuffleProducer), 0)
    self.assertEqual(self.get_count(pp, MyriaDupElim), 1)

    self.db.evaluate(pp)
    result = self.db.get_table('OUTPUT')
    # one entry per distinct h (dict keys collapse duplicates), each
    # expected with a count of 1
    expected = {(h,): 1 for _, h, _ in self.part_data}
    self.assertEqual(result, expected)
| 1031 | + |
def test_no_shuffle_for_partitioned_groupby(self):
    """Do not shuffle for groupby if already partitioned.

    Only the shape of the physical plan is checked here; the plan is
    not evaluated.
    """

    query = """
    r = scan({part});
    t = select r.h, MIN(r.i) from r;
    store(t, OUTPUT);""".format(part=self.part_key)

    lp = self.get_logical_plan(query)
    pp = self.logical_to_physical(lp)

    # shuffles should be removed and the groupby not decomposed into two
    self.assertEqual(self.get_count(pp, MyriaShuffleConsumer), 0)
    self.assertEqual(self.get_count(pp, MyriaShuffleProducer), 0)
    self.assertEqual(self.get_count(pp, MyriaGroupBy), 1)
| 1047 | + |
def test_partition_aware_groupby_into_sql(self):
    """No shuffle for groupby also causes it to be pushed into sql.

    Compiles with ``push_sql`` and ``push_sql_grouping`` enabled, checks
    that the group by is absorbed into a single MyriaQueryScan, then
    evaluates the plan and compares OUTPUT with the per-``h`` minimum of
    ``i`` computed directly from ``self.part_data``.
    """

    query = """
    r = scan({part});
    t = select r.h, MIN(r.i) from r;
    store(t, OUTPUT);""".format(part=self.part_key)

    lp = self.get_logical_plan(query)
    pp = self.logical_to_physical(lp, push_sql=True,
                                  push_sql_grouping=True)

    # shuffles should be removed and the groupby not decomposed into two
    self.assertEqual(self.get_count(pp, MyriaShuffleConsumer), 0)
    self.assertEqual(self.get_count(pp, MyriaShuffleProducer), 0)

    # should be pushed
    self.assertEqual(self.get_count(pp, MyriaGroupBy), 0)
    self.assertEqual(self.get_count(pp, MyriaQueryScan), 1)

    self.db.evaluate(pp)
    result = self.db.get_table('OUTPUT')

    # minimum i for every h; sys.maxsize is the starting value for a
    # group that has not been seen yet
    min_i_by_h = {}
    for _, h, i in self.part_data:
        min_i_by_h[h] = min(min_i_by_h.get(h, sys.maxsize), i)
    expected = {(h, i): 1 for h, i in min_i_by_h.items()}

    self.assertEqual(result, expected)
| 1076 | + |
def test_partition_aware_distinct_into_sql(self):
    """No shuffle for distinct also causes it to be pushed into sql.

    Compiles with ``push_sql`` enabled, checks that neither a group by
    nor a duplicate-elimination operator remains outside the single
    MyriaQueryScan, then evaluates the plan and compares OUTPUT with the
    distinct ``h`` values of ``self.part_data``.
    """

    query = """
    r = scan({part});
    t = select distinct r.h from r;
    store(t, OUTPUT);""".format(part=self.part_key)

    lp = self.get_logical_plan(query)
    pp = self.logical_to_physical(lp, push_sql=True)

    # shuffles should be removed and the distinct not decomposed into two
    self.assertEqual(self.get_count(pp, MyriaShuffleConsumer), 0)
    self.assertEqual(self.get_count(pp, MyriaShuffleProducer), 0)

    # should be pushed
    self.assertEqual(self.get_count(pp, MyriaGroupBy), 0)  # sanity
    self.assertEqual(self.get_count(pp, MyriaDupElim), 0)
    self.assertEqual(self.get_count(pp, MyriaQueryScan), 1)

    self.db.evaluate(pp)
    result = self.db.get_table('OUTPUT')
    # one entry per distinct h, each expected with a count of 1
    expected = {(h,): 1 for _, h, _ in self.part_data}
    self.assertEqual(result, expected)
| 1101 | + |
def test_push_half_groupby_into_sql(self):
    """Push the first group by of decomposed group by into sql.

    Groups by ``i`` rather than ``h``, so a shuffle is still expected;
    one group by should end up inside the MyriaQueryScan and one should
    remain as a MyriaGroupBy. The evaluated OUTPUT is compared with the
    per-``i`` minimum of ``h`` computed from ``self.part_data``.
    """

    query = """
    r = scan({part});
    t = select r.i, MIN(r.h) from r;
    store(t, OUTPUT);""".format(part=self.part_key)

    lp = self.get_logical_plan(query)
    pp = self.logical_to_physical(lp, push_sql=True,
                                  push_sql_grouping=True)

    # wrong partition, so still has shuffle
    self.assertEqual(self.get_count(pp, MyriaShuffleConsumer), 1)
    self.assertEqual(self.get_count(pp, MyriaShuffleProducer), 1)

    # one group by should be pushed
    self.assertEqual(self.get_count(pp, MyriaGroupBy), 1)
    self.assertEqual(self.get_count(pp, MyriaQueryScan), 1)

    self.db.evaluate(pp)
    result = self.db.get_table('OUTPUT')

    # minimum h for every i; sys.maxsize is the starting value for a
    # group that has not been seen yet
    min_h_by_i = {}
    for _, h, i in self.part_data:
        min_h_by_i[i] = min(min_h_by_i.get(i, sys.maxsize), h)
    expected = {(k, v): 1 for k, v in min_h_by_i.items()}

    self.assertEqual(result, expected)
| 1130 | + |
def _check_aggregate_functions_pushed(
        self,
        func,
        expected,
        override=False):
    """Check that an aggregate appears in the SQL of the pushed query scan.

    Builds a query that selects ``r.h`` plus one aggregate, compiles it
    with ``push_sql`` and ``push_sql_grouping`` enabled, asserts that the
    plan contains exactly one MyriaQueryScan, and searches that
    operator's ``sql`` for ``expected``.

    :param func: aggregate function name, applied to ``r.i``; when
        ``override`` is true, the complete aggregate expression instead
    :param expected: regular expression searched for (``re.search``) in
        the SQL of each MyriaQueryScan
    :param override: if true, use ``func`` verbatim as the aggregate
        expression rather than wrapping it as ``func(r.i)``
    """
    if override:
        agg = func
    else:
        agg = "{func}(r.i)".format(func=func)

    query = """
    r = scan({part});
    t = select r.h, {agg} from r;
    store(t, OUTPUT);""".format(part=self.part_key, agg=agg)

    lp = self.get_logical_plan(query)
    pp = self.logical_to_physical(lp, push_sql=True,
                                  push_sql_grouping=True)

    self.assertEqual(self.get_count(pp, MyriaQueryScan), 1)

    for op in pp.walk():
        if isinstance(op, MyriaQueryScan):
            # report the pattern and the SQL on failure instead of a
            # bare "None is not true"
            self.assertTrue(
                re.search(expected, op.sql) is not None,
                "pattern {0!r} not found in pushed SQL: {1}".format(
                    expected, op.sql))
| 1155 | + |
def test_aggregate_AVG_pushed(self):
    """Pushed AVG must show up as ``avg`` in the generated SQL.

    AVG is called out because it is a function not in SQLAlchemy.
    """
    raco_name = aggregate.AVG.__name__
    self._check_aggregate_functions_pushed(func=raco_name, expected='avg')
| 1161 | + |
def test_aggregate_STDDEV_pushed(self):
    """Pushed STDEV must show up as ``stddev_samp`` in the generated SQL.

    STDEV is called out because the function is named differently in
    Raco and postgresql.
    """
    raco_name = aggregate.STDEV.__name__
    self._check_aggregate_functions_pushed(
        func=raco_name, expected='stddev_samp')
| 1167 | + |
def test_aggregate_COUNTALL_pushed(self):
    """Pushed COUNTALL must show up as a ``count(<column>)`` call in SQL.

    COUNTALL is called out because it is expressed differently in Raco
    and postgresql.
    """

    # MyriaL parses count(*) to Raco COUNTALL. And COUNTALL
    # should currently (under the no nulls semantics of Raco/Myria)
    # translate to COUNT(something)
    myrial_expr = 'count(*)'
    sql_pattern = r'count[(][a-zA-Z.]+[)]'
    self._check_aggregate_functions_pushed(
        func=myrial_expr, expected=sql_pattern, override=True)
0 commit comments