Query-generation

Changes On Branch new-predicates

Changes In Branch new-predicates Excluding Merge-Ins

This is equivalent to a diff from 0f0856db5a to 4162607284

2025-05-28
09:27 Merges IN and = predicate check-in: 0a0518ab14 user: mathos tags: trunk
09:24 Adds equality lower bound as an array. Leaf check-in: 4162607284 user: mathos tags: new-predicates
00:05 Finally. A stable version check-in: 48e17f1cde user: mathos tags: new-predicates
2025-05-26
11:22 Minor refactor to predicate class. Ticket [1e726428f6e719fb] check-in: c93b2b766c user: mathos tags: new-predicates
11:12 Fix to be able to save the CSV check-in: 0f0856db5a user: mathos tags: trunk
09:57 adds table_size to histogram check-in: 288ba9b582 user: mathos tags: trunk

Changes to data/histograms/histogram_tpcds.parquet.

cannot compute difference between binary files

Added params_config/search_params/tpcds.toml.
dataset = "TPCDS"
dev = true
max_hops = [1,2,4]
extra_predicates = [1,3,5]
row_retention_probability = [0.2, 0.3, 0.4, 0.6, 0.8, 0.85, 0.9, 1.0]
unique_joins = true
max_queries_per_fact_table = 10
max_queries_per_signature = 2
keep_edge_probability = 0.2
equality_lower_bound_probability = [0,0.1]
extra_values_for_in = 3

[operator_weights]
operator_in = 1
operator_range = 3
operator_equal = 3
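
The list-valued entries above define a parameter sweep: the search runs one batch per combination of max_hops, extra_predicates, row_retention_probability and equality_lower_bound_probability. As a quick sanity check, a minimal standalone sketch of the resulting batch count (values copied from this file; the variable names are only illustrative):

from itertools import product

max_hops = [1, 2, 4]
extra_predicates = [1, 3, 5]
row_retention_probability = [0.2, 0.3, 0.4, 0.6, 0.8, 0.85, 0.9, 1.0]
equality_lower_bound_probability = [0, 0.1]

# One batch per combination, mirroring get_total_iterations() in
# src/query_generator/duckdb_connection/binning.py further down this diff.
combinations = list(
  product(
    max_hops,
    extra_predicates,
    row_retention_probability,
    equality_lower_bound_probability,
  )
)
assert len(combinations) == 3 * 3 * 8 * 2  # 144 configurations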
Added params_config/search_params/tpcds_dev.toml.
dataset = "TPCDS"
dev = true
max_hops = [1]
extra_predicates = [5]
row_retention_probability = [0.2, 0.9]
unique_joins = true
max_queries_per_fact_table = 1
max_queries_per_signature = 2
keep_edge_probability = 0.2
equality_lower_bound_probability = [0,0.1]
extra_values_for_in = 3

[operator_weights]
operator_in = 1
operator_range = 3
operator_equal = 3
Added params_config/snowflake/tpcds.toml.
dataset = "TPCDS"
max_hops = 3
max_queries_per_fact_table = 100
max_queries_per_signature = 1
keep_edge_probability = 0.2



[predicate_parameters]
row_retention_probability = 0.2
extra_predicates = 3
equality_lower_bound_probability = 0.00
extra_values_for_in = 3

[predicate_parameters.operator_weights]
operator_in = 1
operator_range = 3
operator_equal = 3
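
For context, a hedged sketch of how a file like this is consumed: the reworked snowflake command (see main.py further down) passes the path to read_and_parse_toml, which structures the TOML into the SnowflakeEndpoint dataclass added in utils/params.py. The path below is just this file; running the sketch assumes the package is importable.

from pathlib import Path

from query_generator.utils.params import SnowflakeEndpoint, read_and_parse_toml

endpoint = read_and_parse_toml(
  Path("params_config/snowflake/tpcds.toml"),
  SnowflakeEndpoint,
)
print(endpoint.max_hops)                               # 3
print(endpoint.predicate_parameters.extra_predicates)  # 3
print(endpoint.predicate_parameters.operator_weights)  # weights for IN / range / =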
Changes to pyproject.toml.
typer = ">=0.15.2,<0.16"
rich  = ">=14.0.0,<15"
pypika = ">=0.48.9,<0.49"
numpy = ">=2.2.5,<3"
duckdb = ">=1.2.2,<2"
polars = ">=1.27.1,<2"
tqdm = "*"
cattrs = ">=24.1.2,<25"


[tool.pixi.feature.test.dependencies]
pytest = ">=8.3.5,<9"

[tool.pixi.feature.lint.dependencies]
ruff = ">=0.11.7,<0.12"
Changes to src/query_generator/database_schemas/schemas.py.
from typing import Any

from query_generator.database_schemas.tpcds import get_tpcds_table_info
from query_generator.database_schemas.tpch import get_tpch_table_info
from query_generator.utils.definitions import Dataset
from query_generator.utils.exceptions import (
  PartiallySupportedDatasetError,
  UnkwonDatasetError,
  UnkownDatasetError,
)


def get_schema(dataset: Dataset) -> tuple[dict[str, dict[str, Any]], list[str]]:
  """Get the schema of the database based on the dataset.

  Args:
      dataset (Dataset): The dataset to get the schema for.

  Returns:
      Tuple[Dict[str, Dict[str, Any]], List[str]]: A tuple containing the schema
      as a dictionary and a list of fact tables

  """
  if dataset == Dataset.TPCDS:
    return get_tpcds_table_info()
  if dataset == Dataset.TPCH:
    return get_tpch_table_info()
  if dataset == Dataset.JOB:
    raise PartiallySupportedDatasetError(dataset.value)
  raise UnkwonDatasetError(dataset)
  raise UnkownDatasetError(dataset)
Changes to src/query_generator/database_schemas/tpcds.py.
        "s_floor_space": {"max": 9917607, "min": 5010719},
        "s_gmt_offset": {"max": -5.0, "min": -6.0},
        "s_market_id": {"max": 10, "min": 1},
        "s_number_employees": {"max": 300, "min": 200},
        "s_rec_end_date": {"max": "2001-03-12", "min": "1999-03-13"},
        "s_rec_start_date": {"max": "2001-03-13", "min": "1997-03-13"},
        "s_store_sk": {"max": 402, "min": 1},
        "s_tax_precentage": {"max": 0.11, "min": 0.0},
      },
      "foreign_keys": [],
    },
    "store_returns": {
      "alias": "sr",
      "columns": {
        "sr_addr_sk": {"max": 1000000, "min": 1},
Changes to src/query_generator/duckdb_connection/binning.py.
  QueryGenerator,
)
from query_generator.join_based_query_generator.utils.query_writer import (
  Writer,
)
from query_generator.utils.definitions import (
  BatchGeneratedQueryFeatures,
  Dataset,
  Extension,
  PredicateParameters,
  QueryGenerationParameters,
)
from query_generator.utils.params import SearchParametersEndpoint


@dataclass
class SearchParameters:
  dataset: Dataset
  user_input: SearchParametersEndpoint
  scale_factor: int | float
  con: duckdb.DuckDBPyConnection
  max_hops: list[int]
  extra_predicates: list[int]
  row_retention_probability: list[float]
  unique_joins: bool


def get_result_from_duckdb(query: str, con: duckdb.DuckDBPyConnection) -> int:
  try:
    result = int(con.sql(query).fetchall()[0][0])
  except duckdb.BinderException as e:
    print(f"Invalid query, exception: {e},\n{query}")
    return -1
  return result


def get_total_iterations(search_params: SearchParameters) -> int:
def get_total_iterations(search_params: SearchParametersEndpoint) -> int:
  """Get the total number of iterations for the Snowflake binning process.

  Args:
    search_params (SearchParameters): The parameters for the Snowflake
    binning process.

  Returns:
    int: The total number of iterations.

  """
  return (
    len(search_params.max_hops)
    * len(search_params.extra_predicates)
    * len(search_params.row_retention_probability)
    * len(search_params.equality_lower_bound_probability)
  )


def run_snowflake_param_seach(
  search_params: SearchParameters,
) -> None:
  """Run the Snowflake binning process. Binning is equiwidth binning.

  Args:
    parameters (BinningSnowflakeParameters): The parameters for
    the Snowflake binning process.

  """
  query_writer = Writer(
    search_params.dataset,
    search_params.user_input.dataset,
    Extension.SNOWFLAKE_SEARCH_PARAMS,
  )
  rows: list[dict[str, str | int | float]] = []
  total_iterations = get_total_iterations(search_params)
  total_iterations = get_total_iterations(search_params.user_input)
  batch_number = 0
  seen_subgraphs: dict[int, bool] = {}
  for (
    max_hops,
    extra_predicates,
  for max_hops, extra_predicates, row_retention_probability in tqdm(
    row_retention_probability,
    equality_lower_bound_probability,
  ) in tqdm(
    product(
      search_params.max_hops,
      search_params.extra_predicates,
      search_params.row_retention_probability,
      search_params.user_input.max_hops,
      search_params.user_input.extra_predicates,
      search_params.user_input.row_retention_probability,
      search_params.user_input.equality_lower_bound_probability,
    ),
    total=total_iterations,
    desc="Progress",
  ):
    batch_number += 1
    query_generator = QueryGenerator(
      QueryGenerationParameters(
        dataset=search_params.dataset,
        dataset=search_params.user_input.dataset,
        max_hops=max_hops,
        max_queries_per_fact_table=10,
        max_queries_per_signature=2,
        keep_edge_prob=0.2,
        extra_predicates=extra_predicates,
        row_retention_probability=float(row_retention_probability),
        max_queries_per_fact_table=search_params.user_input.max_queries_per_fact_table,
        max_queries_per_signature=search_params.user_input.max_queries_per_signature,
        keep_edge_probability=search_params.user_input.keep_edge_probability,
        seen_subgraphs=seen_subgraphs,
        predicate_parameters=PredicateParameters(
          extra_predicates=extra_predicates,
          row_retention_probability=row_retention_probability,
        seen_subgraphs=seen_subgraphs,
          operator_weights=search_params.user_input.operator_weights,
          equality_lower_bound_probability=equality_lower_bound_probability,
          extra_values_for_in=search_params.user_input.extra_values_for_in,
        ),
      )
    )
    for query in query_generator.generate_queries():
      selected_rows = get_result_from_duckdb(query.query, search_params.con)
      if selected_rows == -1:
        continue  # invalid query

…
          "predicate_number": query.predicate_number,
          "fact_table": query.fact_table,
          "max_hops": max_hops,
          "row_retention_probability": row_retention_probability,
        },
      )
    # Update the seen subgraphs with the new ones
    if search_params.unique_joins:
    if search_params.user_input.unique_joins:
      seen_subgraphs = query_generator.subgraph_generator.seen_subgraphs
  df_queries = pl.DataFrame(rows)
  query_writer.write_dataframe(df_queries)
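
As an aside, the row-count probe that get_result_from_duckdb runs for every generated query boils down to reading the first cell of a COUNT(*) result and treating binder errors as invalid queries. A self-contained toy sketch of that pattern (the table and queries here are made up):

import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE t AS SELECT range AS x FROM range(100)")

def count_rows(query: str, con: duckdb.DuckDBPyConnection) -> int:
  # First cell of the first row; a BinderException marks the query as invalid.
  try:
    return int(con.sql(query).fetchall()[0][0])
  except duckdb.BinderException as e:
    print(f"Invalid query, exception: {e}")
    return -1

print(count_rows("SELECT COUNT(*) FROM t WHERE x < 10", con))     # 10
print(count_rows("SELECT COUNT(*) FROM t WHERE nope < 10", con))  # -1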
Changes to src/query_generator/duckdb_connection/setup.py.
import os

import duckdb

from query_generator.utils.definitions import Dataset
from query_generator.utils.exceptions import (
  MissingScaleFactorError,
  PartiallySupportedDatasetError,
  UnkwonDatasetError,
  UnkownDatasetError,
)


def load_and_install_libraries() -> None:
  duckdb.install_extension("TPCDS")
  duckdb.install_extension("TPCH")
  duckdb.load_extension("TPCDS")
…
  if dataset == Dataset.TPCDS:
    con.execute(f"CALL dsdgen(sf = {scale_factor})")
  elif dataset == Dataset.TPCH:
    con.execute(f"CALL dbgen(sf = {scale_factor})")
  elif dataset == Dataset.JOB:
    raise PartiallySupportedDatasetError(dataset.value)
  else:
    raise UnkwonDatasetError(dataset)
    raise UnkownDatasetError(dataset)


def get_path(
  dataset: Dataset,
  scale_factor: float | int | None,
) -> str:
  if dataset in [Dataset.TPCDS, Dataset.TPCH]:
    return f"data/duckdb/{dataset.value}/{scale_factor}.db"
  if dataset == Dataset.JOB:
    return f"data/duckdb/{dataset.value}/job.db"
  raise UnkwonDatasetError(dataset.value)
  raise UnkownDatasetError(dataset.value)


def setup_duckdb(
  dataset: Dataset,
  scale_factor: int | float | None = None,
) -> duckdb.DuckDBPyConnection:
  """Installs TPCDS and TPCH datasets in DuckDB.
Changes to src/query_generator/join_based_query_generator/snowflake.py.
# fmt: on
from query_generator.join_based_query_generator.utils.query_writer import (
  Writer,
)
from query_generator.predicate_generator.predicate_generator import (
  HistogramDataType,
  PredicateEquality,
  PredicateGenerator,
  PredicateIn,
  PredicateRange,
  SupportedHistogramType,
)
from query_generator.utils.definitions import (
  Dataset,
  Extension,
  GeneratedQueryFeatures,
  PredicateParameters,
  QueryGenerationParameters,
)
from query_generator.utils.exceptions import InvalidHistogramTypeError
from query_generator.utils.utils import set_seed


class QueryBuilder:
  def __init__(
    self,
    subgraph_generator: SubGraphGenerator,
    # TODO(Gabriel): http://localhost:8080/tktview/b9400c203a38f3aef46ec250d98563638ba7988b
    tables_schema: Any,
    dataset: Dataset,
    predicate_params: PredicateParameters,
  ) -> None:
    self.sub_graph_gen = subgraph_generator
    self.table_to_pypika_table = {
      i: Table(i, alias=tables_schema[i]["alias"]) for i in tables_schema
    }
    self.predicate_gen = PredicateGenerator(dataset)
    self.predicate_gen = PredicateGenerator(dataset, predicate_params)
    self.tables_schema = tables_schema

  def get_subgraph_tables(
    self,
    subgraph: list[ForeignKeyGraph.Edge],
  ) -> list[str]:
    return list(
…
      )
    return query

  def add_predicates(
    self,
    subgraph: list[ForeignKeyGraph.Edge],
    query: OracleQuery,
    extra_predicates: int,
    row_retention_probability: float,
  ) -> OracleQuery:
    subgraph_tables = self.get_subgraph_tables(subgraph)
    for predicate in self.predicate_gen.get_random_predicates(
      subgraph_tables,
      extra_predicates,
      row_retention_probability,
    ):
      if isinstance(predicate, PredicateRange):
      query = self._add_range(query, predicate)
        return self._add_range(query, predicate)
      if isinstance(predicate, PredicateEquality):
        return self._add_equality(query, predicate)
      if isinstance(predicate, PredicateIn):
        return self._add_in(query, predicate)
      raise InvalidHistogramTypeError(str(predicate.dtype))
    return query

  def _add_range(
    self, query: OracleQuery, predicate: PredicateGenerator.Predicate
  ) -> OracleQuery:
  def _cast_if_needed(
    self, value: SupportedHistogramType, dtype: HistogramDataType
  ) -> Any:
    if predicate.dtype in [HistogramDataType.INT, HistogramDataType.FLOAT]:
      return self._add_range_number(query, predicate)
    if predicate.dtype in [HistogramDataType.DATE]:
      return self._add_range_date(query, predicate)
    """Cast the value to the appropriate type if needed."""
    if dtype == HistogramDataType.DATE:
      return fn.Cast(value, "date")
    if predicate.dtype in [HistogramDataType.STRING]:
      return self._add_range_string(query, predicate)
    return value
    raise InvalidHistogramTypeError(str(predicate.dtype))

  def _add_range_number(
    self, query: OracleQuery, predicate: PredicateGenerator.Predicate
  def _add_range(
    self, query: OracleQuery, predicate: PredicateRange
  ) -> OracleQuery:
    return query.where(
      self.table_to_pypika_table[predicate.table][predicate.column]
      >= predicate.min_value,
    ).where(
      self.table_to_pypika_table[predicate.table][predicate.column]
      >= self._cast_if_needed(predicate.min_value, predicate.dtype),
    ).where(
      self.table_to_pypika_table[predicate.table][predicate.column]
      <= predicate.max_value,
      <= self._cast_if_needed(predicate.max_value, predicate.dtype)
    )

  def _add_range_date(
    self, query: OracleQuery, predicate: PredicateGenerator.Predicate
  def _add_equality(
    self, query: OracleQuery, predicate: PredicateEquality
  ) -> OracleQuery:
    return query.where(
      self.table_to_pypika_table[predicate.table][predicate.column]
      >= fn.Cast(predicate.min_value, "date"),
      == predicate.equality_value
    ).where(
      self.table_to_pypika_table[predicate.table][predicate.column]
      <= fn.Cast(predicate.max_value, "date"),
    )

  def _add_range_string(
    self, query: OracleQuery, predicate: PredicateGenerator.Predicate
  def _add_in(self, query: OracleQuery, predicate: PredicateIn) -> OracleQuery:
  ) -> OracleQuery:
    return query.where(
      self.table_to_pypika_table[predicate.table][predicate.column]
      >= predicate.min_value,
      self.table_to_pypika_table[predicate.table][predicate.column].isin(
        [self._cast_if_needed(i, predicate.dtype) for i in predicate.in_values]
    ).where(
      self.table_to_pypika_table[predicate.table][predicate.column]
      <= predicate.max_value
      )
    )


class QueryGenerator:
  def __init__(self, params: QueryGenerationParameters) -> None:
    set_seed()
    self.params = params
    self.tables_schema, self.fact_tables = get_schema(params.dataset)
    self.foreign_key_graph = ForeignKeyGraph(self.tables_schema)
    self.subgraph_generator = SubGraphGenerator(
      self.foreign_key_graph,
      params.keep_edge_prob,
      params.keep_edge_probability,
      params.max_hops,
      params.seen_subgraphs,
    )
    self.query_builder = QueryBuilder(
      self.subgraph_generator,
      self.tables_schema,
      params.dataset,
      params.predicate_parameters,
    )

  def generate_queries(self) -> Iterator[GeneratedQueryFeatures]:
    for fact_table in self.fact_tables:
      for cnt, subgraph in enumerate(
        self.subgraph_generator.generate_subgraph(
          fact_table,
          self.params.max_queries_per_fact_table,
        ),
      ):
        query = self.query_builder.generate_query_from_subgraph(subgraph)
        for idx in range(1, self.params.max_queries_per_signature + 1):
          query = self.query_builder.add_predicates(
            subgraph,
            query,
            self.params.extra_predicates,
            self.params.row_retention_probability,
          )

          yield GeneratedQueryFeatures(
            query=query.get_sql(),
            template_number=cnt,
            predicate_number=idx,
            fact_table=fact_table,
Changes to src/query_generator/join_based_query_generator/utils/subgraph_generator.py.
MAX_ATTEMPTS_FOR_NEW_SUBGRAPH = 1000


class SubGraphGenerator:
  def __init__(
    self,
    graph: ForeignKeyGraph,
    keep_edge_prob: float,
    keep_edge_probability: float,
    max_hops: int,
    seen_subgraphs: dict[int, bool],
  ) -> None:
    self.hops = max_hops
    self.keep_edge_prob = keep_edge_prob
    self.keep_edge_probability = keep_edge_probability
    self.graph = graph
    self.seen_subgraphs: dict[int, bool] = seen_subgraphs.copy()

  def get_random_subgraph(self, fact_table: str) -> list[ForeignKeyGraph.Edge]:
    """Starting from the fact table, for each edge of the current table we
    decide based on the keep_edge_probability whether to keep the edge or not.
    decide based on the keep_edge_probabilityability whether to keep the
    edge or not.

    We repeat this process up until the maximum number of hops.
    """

    @dataclass
    class JoinDepthNode:
      table: str
      depth: int

    queue: deque[JoinDepthNode] = deque()
    queue.append(JoinDepthNode(fact_table, 0))
    edges_subgraph = []

    while queue:
      current_node = queue.popleft()
      if current_node.depth >= self.hops:
        continue

      current_edges = self.graph.get_edges(current_node.table)
      for current_edge in current_edges:
        if random.random() < self.keep_edge_prob:
        if random.random() < self.keep_edge_probability:
          edges_subgraph.append(current_edge)
          queue.append(
            JoinDepthNode(
              current_edge.reference_table.name,
              current_node.depth + 1,
            ),
          )
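
To make the keep_edge_probability semantics concrete, a toy, self-contained sketch of the same random walk over a hypothetical foreign-key adjacency map (the table names are invented; the real logic is get_random_subgraph above):

import random
from collections import deque

# Hypothetical FK adjacency: table -> tables it references.
fk_edges = {
  "store_sales": ["date_dim", "store", "item"],
  "store": ["date_dim"],
}

def random_subgraph(fact_table, keep_edge_probability=0.2, max_hops=2):
  queue, kept = deque([(fact_table, 0)]), []
  while queue:
    table, depth = queue.popleft()
    if depth >= max_hops:
      continue
    for referenced in fk_edges.get(table, []):
      # Each outgoing edge survives with probability keep_edge_probability.
      if random.random() < keep_edge_probability:
        kept.append((table, referenced))
        queue.append((referenced, depth + 1))
  return kept

print(random_subgraph("store_sales", keep_edge_probability=0.5))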
Changes to src/query_generator/main.py.
  make_redundant_histograms,
  query_histograms,
)
from query_generator.utils.definitions import (
  Dataset,
  Extension,
  QueryGenerationParameters,
)
from query_generator.utils.params import (
  SearchParametersEndpoint,
  SnowflakeEndpoint,
  read_and_parse_toml,
)
from query_generator.utils.show_messages import show_dev_warning
from query_generator.utils.utils import validate_file_path

app = typer.Typer(name="Query Generation")


@app.command()
def snowflake(
  dataset: Annotated[
  config_path: Annotated[
    Dataset,
    typer.Option("--dataset", "-d", help="The dataset used"),
  ],
  max_hops: Annotated[
    int,
    str,
    typer.Option(
      "--max-hops",
      "-h",
      "-c",
      help="The maximum number of hops",
      min=1,
      max=5,
    ),
  ] = 3,
  max_queries_per_fact_table: Annotated[
    int,
    typer.Option(
      "--fact",
      "--config",
      "-f",
      help="The maximum number of queries per fact table",
      help="The path to the configuration file"
      min=1,
    ),
  ] = 100,
  max_queries_per_signature: Annotated[
    int,
    typer.Option(
      "--signature",
      "-s",
      help="The maximum number of queries per signature/template",
      min=1,
    ),
  ] = 1,
  keep_edge_prob: Annotated[
    float,
    typer.Option(
      "--edge-prob",
      "-p",
      help="The probability of keeping an edge in the subgraph",
      "They can be found in the params_config/query_generation/ folder",
      min=0.0,
      max=1.0,
    ),
  ] = 0.2,
  row_retention_probability: Annotated[
    float,
    typer.Option(
      "--row-retention",
      "-r",
      help="The probability of keeping a row in each predicate",
      min=0.0,
      max=1.0,
    ),
  ] = 0.2,
  extra_predicates: Annotated[
    int,
    typer.Option(
      "--extra-predicates",
      "-e",
      help="The number of extra predicates to add to the query",
      min=0,
    ),
  ] = 3,
  ],
) -> None:
  """Generate queries using a random subgraph."""
  params_endpoint = read_and_parse_toml(Path(config_path), SnowflakeEndpoint)
  params = QueryGenerationParameters(
    dataset=dataset,
    max_hops=max_hops,
    max_queries_per_fact_table=max_queries_per_fact_table,
    max_queries_per_signature=max_queries_per_signature,
    keep_edge_prob=keep_edge_prob,
    dataset=params_endpoint.dataset,
    max_hops=params_endpoint.max_hops,
    max_queries_per_fact_table=params_endpoint.max_queries_per_fact_table,
    max_queries_per_signature=params_endpoint.max_queries_per_signature,
    keep_edge_probability=params_endpoint.keep_edge_probability,
    extra_predicates=extra_predicates,
    row_retention_probability=row_retention_probability,
    seen_subgraphs={},
    predicate_parameters=params_endpoint.predicate_parameters,
  )
  generate_and_write_queries(params)


@app.command()
def param_search(
  dataset: Annotated[
    Dataset,
    typer.Option("--dataset", "-d", help="The dataset used"),
  ],
  *,
  dev: Annotated[
  config_path: Annotated[
    bool,
    typer.Option(
      "--dev",
      help="Development testing. If true then uses scale factor 0.1 to check.",
    ),
  ] = False,
  unique_joins: Annotated[
    bool,
    typer.Option(
      "--unique-joins",
      "-u",
      help="If true all queries will have a unique join structure "
      "(not recommended for TPC-H)",
    ),
  ] = False,
  max_hops_range: Annotated[
    list[int] | None,
    str,
    typer.Option(
      "--max-hops-range",
      "-h",
      "-c",
      help="The range of hops to use for the query generation",
      show_default="1, 2, 4",
    ),
  ] = None,
  extra_predicates_range: Annotated[
    list[int] | None,
    typer.Option(
      "--extra-predicates-range",
      "-e",
      "--config",
      help="The range of extra predicates to use for the query generation",
      show_default="1, 2, 3, 5",
    ),
  ] = None,
  row_retention_probability_range: Annotated[
    list[float] | None,
    typer.Option(
      "--row-retention-probability-range",
      "-r",
      help="The range of row retention probabilities to use "
      help="The path to the configuration file"
      "for the query generation",
      show_default="0.2, 0.3, 0.4, 0.6, 0.8, 0.85, 0.9, 1.0",
      "They can be found in the params_config/search_params/ folder",
    ),
  ] = None,
  ],
) -> None:
  """This is an extension of the Snowflake algorithm.

  It runs multiple batches with different configurations of the algorithm.
  This allows us to get multiple results.
  """
  if max_hops_range is None:
    max_hops_range = [1, 2, 4]
  if extra_predicates_range is None:
  params = read_and_parse_toml(
    Path(config_path),
    SearchParametersEndpoint,
    extra_predicates_range = [1, 2, 3, 5]
  if row_retention_probability_range is None:
    row_retention_probability_range = [0.2, 0.3, 0.4, 0.6, 0.8, 0.85, 0.9, 1.0]
  show_dev_warning(dev=dev)
  scale_factor = 0.1 if dev else 100
  con = setup_duckdb(dataset, scale_factor)
  )
  show_dev_warning(dev=params.dev)
  scale_factor = 0.1 if params.dev else 100
  con = setup_duckdb(params.dataset, scale_factor)
  run_snowflake_param_seach(
    SearchParameters(
      scale_factor=scale_factor,
      con=con,
      dataset=dataset,
      user_input=params,
      max_hops=max_hops_range,
      extra_predicates=extra_predicates_range,
      row_retention_probability=row_retention_probability_range,
      unique_joins=unique_joins,
    ),
  )


@app.command()
def cherry_pick(
  dataset: Annotated[
Changes to src/query_generator/predicate_generator/predicate_generator.py.
import math
import random
from abc import ABC
from collections.abc import Iterator
from dataclasses import dataclass
from enum import Enum

import numpy as np
import polars as pl

from query_generator.tools.histograms import HistogramColumns
from query_generator.utils.definitions import Dataset
from query_generator.tools.histograms import (
  HistogramColumns,
  MostCommonValuesColumns,
)
from query_generator.utils.definitions import (
  Dataset,
  PredicateOperatorProbability,
  PredicateParameters,
)
from query_generator.utils.exceptions import (
  InvalidHistogramTypeError,
  UnkwonDatasetError,
  UnkownDatasetError,
)

SupportedHistogramType = float | int | str
SuportedHistogramArrayType = list[float] | list[int] | list[str]


MAX_DISTINCT_COUNT_FOR_RANGE = 500
PROBABILITY_TO_CHOOSE_EQUALITY = 0.8
PREDICATE_IN_SIZE = 5


class PredicateTypes(Enum):
  IN = "in"
  RANGE = "range"
  EQUALITY = "equality"


class HistogramDataType(Enum):
  INT = "int"
  FLOAT = "float"
  DATE = "date"
  STRING = "string"


class PredicateGenerator:
  @dataclass
  class Predicate:
    table: str
    column: str
    min_value: SupportedHistogramType
    max_value: SupportedHistogramType
    dtype: HistogramDataType
@dataclass
class Predicate(ABC):
  table: str
  column: str
  dtype: HistogramDataType


@dataclass
class PredicateRange(Predicate):
  min_value: SupportedHistogramType
  max_value: SupportedHistogramType


@dataclass
class PredicateEquality(Predicate):
  equality_value: SupportedHistogramType


@dataclass
class PredicateIn(Predicate):
  in_values: SuportedHistogramArrayType


class PredicateGenerator:
  def __init__(self, dataset: Dataset):
  def __init__(self, dataset: Dataset, predicate_params: PredicateParameters):
    self.dataset = dataset
    self.histogram: pl.DataFrame = self.read_histogram()
    self.predicate_params = predicate_params

  def _parse_bin(
    self, hist_array: list[str], dtype: HistogramDataType
  def _cast_array(
    self, str_array: list[str], dtype: HistogramDataType
  ) -> SuportedHistogramArrayType:
    """Parse the bin string representation to a list of values.

    Args:
        bin_str (str): String representation of bins.
        dtype (str): Data type of the values.

    Returns:
        list: List of parsed values.

    """
    if dtype == HistogramDataType.INT:
      return [int(float(x)) for x in hist_array]
      return [int(float(x)) for x in str_array]
    if dtype == HistogramDataType.FLOAT:
      return [float(x) for x in hist_array]
      return [float(x) for x in str_array]
    if dtype == HistogramDataType.DATE:
      return str_array
    if dtype == HistogramDataType.STRING:
      return str_array
    raise InvalidHistogramTypeError(dtype)

  def _cast_element(
    self, value: str, dtype: HistogramDataType
  ) -> SupportedHistogramType:
    if dtype == HistogramDataType.INT:
      return int(float(value))
    if dtype == HistogramDataType.FLOAT:
      return float(value)
    if dtype == HistogramDataType.DATE:
      return hist_array
      return value
    if dtype == HistogramDataType.STRING:
      return hist_array
      return value
    raise InvalidHistogramTypeError(dtype)

  def read_histogram(self) -> pl.DataFrame:
    """Read the histogram data for the specified dataset.

    Args:
        dataset: The dataset type (TPCH or TPCDS).

    Returns:
        pd.DataFrame: DataFrame containing the histogram data.

    """
    if self.dataset == Dataset.TPCH:
      path = "data/histograms/histogram_tpch.parquet"
    elif self.dataset == Dataset.TPCDS:
      path = "data/histograms/histogram_tpcds.parquet"
    elif self.dataset == Dataset.JOB:
      path = "data/histograms/histogram_job.parquet"
    else:
      raise UnkwonDatasetError(self.dataset.value)
      raise UnkownDatasetError(self.dataset.value)
    return pl.read_parquet(path).filter(pl.col("histogram") != [])

  def _get_histogram_type(self, dtype: str) -> HistogramDataType:
    if dtype in ["INTEGER", "BIGINT"]:
      return HistogramDataType.INT
    if dtype.startswith("DECIMAL"):
      return HistogramDataType.FLOAT
    if dtype == "DATE":
      return HistogramDataType.DATE
    if dtype == "VARCHAR":
      return HistogramDataType.STRING
    raise InvalidHistogramTypeError(dtype)

  def _choose_predicate_type(
    self, operator_weights: PredicateOperatorProbability
  ) -> PredicateTypes:
    weights = [
      operator_weights.operator_equal,
      operator_weights.operator_in,
      operator_weights.operator_range,
    ]
    return random.choices(
      [
        PredicateTypes.EQUALITY,
        PredicateTypes.IN,
        PredicateTypes.RANGE,
      ],
      weights=weights,
    )[0]

  def get_random_predicates(
    self,
    tables: list[str],
    num_predicates: int,
    row_retention_probability: float,
  ) -> Iterator["PredicateGenerator.Predicate"]:
  ) -> Iterator[Predicate]:
    """Generate random predicates based on the histogram data.

    Args:
        tables (str): List of tables to select predicates from.
        num_predicates (int): Number of predicates to generate.
        row_retention_probability (float): Probability of retaining rows.

    Returns:
        List[PredicateGenerator.Predicate]: List of generated predicates.
        List[Predicate]: List of generated predicates.

    """
    selected_tables_histogram = self.histogram.filter(
      pl.col(HistogramColumns.TABLE.value).is_in(tables)
    )

    for row in selected_tables_histogram.sample(n=num_predicates).iter_rows(
      named=True
    for row in selected_tables_histogram.sample(
      n=self.predicate_params.extra_predicates
    ).iter_rows(named=True):
    ):
      table = row[HistogramColumns.TABLE.value]
      column = row[HistogramColumns.COLUMN.value]
      dtype = self._get_histogram_type(row[HistogramColumns.DTYPE.value])
      predicate_type = self._choose_predicate_type(
        self.predicate_params.operator_weights
      )

      if predicate_type == PredicateTypes.RANGE:
        yield self._get_range_predicate(
      bins = row[HistogramColumns.HISTOGRAM.value]
      dtype = self._get_histogram_type(row[HistogramColumns.DTYPE.value])
      min_value, max_value = self._get_min_max_from_bins(
        bins, row_retention_probability, dtype
      )
      predicate = PredicateGenerator.Predicate(
        table=table,
        column=column,
        min_value=min_value,
        max_value=max_value,
        dtype=dtype,
      )
          table, column, row[HistogramColumns.HISTOGRAM.value], dtype
        )
      elif predicate_type == PredicateTypes.IN:
        array = self._get_in_array(
          row[HistogramColumns.MOST_COMMON_VALUES.value],
          row[HistogramColumns.TABLE_SIZE.value],
          row[HistogramColumns.HISTOGRAM_MCV.value],
        )
        if array is not None:
          yield self._get_in_predicate(array, table, column, dtype)
        else:
          continue
      elif predicate_type == PredicateTypes.EQUALITY:
        value = self._get_equality_value(
          row[HistogramColumns.MOST_COMMON_VALUES.value],
          row[HistogramColumns.TABLE_SIZE.value],
        )
        if value is not None:
          yield self._get_equality_predicate(value, table, column, dtype)
        else:
          continue

  def _get_in_predicate(
    self, array: list[str], table: str, column: str, dtype: HistogramDataType
  ) -> PredicateIn:
    cast_array = self._cast_array(array, dtype)
    return PredicateIn(table, column, dtype, cast_array)

  def _get_in_array(
    self,
    most_common_values: list[dict[str, int | str]],
    table_size: int,
    histogram: list[str],
  ) -> list[str] | None:
    """
    Gets the array for the IN operator
    """
    value = self._get_equality_value(most_common_values, table_size)
    if value is None:
      return None
    noise_values = random.sample(
      histogram,
      k=min(self.predicate_params.extra_values_for_in, len(histogram)),
    )
    return [value] + noise_values

  def _get_equality_predicate(
    self, value: str, table: str, column: str, dtype: HistogramDataType
  ) -> PredicateEquality:
    cast_value = self._cast_element(value, dtype)
    return PredicateEquality(
      table=table, column=column, dtype=dtype, equality_value=cast_value
    )

  def _get_equality_value(
    self,
    most_common_values: list[dict[str, int | str]],
    table_size: int,
  ) -> str | None:
    mcv_probabilities: list[float] = [
      float(table_size) / float(v[MostCommonValuesColumns.COUNT.value])
      for v in most_common_values
    ]
    mcv_probabilities_np = np.array(mcv_probabilities)
    filtered_indices = np.where(
      mcv_probabilities_np
      > self.predicate_params.equality_lower_bound_probability
    )[0]
    if len(filtered_indices) == 0:
      return None
    idx = random.choice(filtered_indices)
    value = most_common_values[idx][MostCommonValuesColumns.VALUE.value]
    assert isinstance(value, str)
    return value

  def _get_range_predicate(
    self,
    table: str,
    column: str,
    bins: list[str],
    dtype: HistogramDataType,
  ) -> PredicateRange:
    min_value, max_value = self._get_min_max_from_bins(bins, dtype)
    return PredicateRange(
      table=table,
      column=column,
      min_value=min_value,
      max_value=max_value,
      dtype=dtype,
    )
      yield predicate

  def _get_min_max_from_bins(
    self,
    bins: list[str],
    row_retention_probability: float,
    dtype: HistogramDataType,
  ) -> tuple[SupportedHistogramType, SupportedHistogramType]:
    """Convert the bins string representation to a tuple of min and max values.

    Args:
        bins (str): String representation of bins.
        row_retention_probability (float): Probability of retaining rows.

    Returns:
        tuple: Tuple containing min and max values.

    """
    histogram_array: SuportedHistogramArrayType = self._parse_bin(bins, dtype)
    histogram_array: SuportedHistogramArrayType = self._cast_array(bins, dtype)
    subrange_length = math.ceil(
      row_retention_probability * len(histogram_array)
      self.predicate_params.row_retention_probability * len(histogram_array)
    )
    start_index = random.randint(0, len(histogram_array) - subrange_length)

    min_value = histogram_array[start_index]
    max_value = histogram_array[
      min(start_index + subrange_length, len(histogram_array) - 1)
    ]
    return min_value, max_value
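
For intuition on the new operator_weights, a small standalone sketch of the weighted draw that _choose_predicate_type performs with random.choices (weights taken from the TOML configs above; the counts are only approximate by construction):

import random
from collections import Counter

operator_weights = {"operator_equal": 3, "operator_in": 1, "operator_range": 3}

draws = Counter(
  random.choices(
    list(operator_weights.keys()),
    weights=list(operator_weights.values()),
  )[0]
  for _ in range(7_000)
)
# Roughly 3000 equality, 1000 IN, and 3000 range predicates are expected.
print(draws)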
Changes to src/query_generator/tools/histograms.py.
  get_equi_height_histogram,
  get_frequent_non_null_values,
  get_histogram_excluding_common_values,
  get_tables,
)
from query_generator.utils.exceptions import InvalidHistogramTypeError

LIMIT_FOR_DISTINCT_VALUES = 1000

class MostCommonValuesColumns(Enum):
  VALUE = "value"
  COUNT = "count"


class RedundantHistogramsDataType(Enum):
  """
  This class was made for compatibility with old code that
  generated this histogram:
  https://github.com/udao-moo/udao-spark-optimizer-dev/blob/main
…
def get_most_common_values(
  con: duckdb.DuckDBPyConnection,
  table: str,
  column: str,
  common_value_size: int,
  distinct_count: int,
) -> list[RawDuckDBMostCommonValues]:
  result: list[RawDuckDBMostCommonValues] = []
  if distinct_count < LIMIT_FOR_DISTINCT_VALUES:
    result = get_frequent_non_null_values(con, table, column, common_value_size)
  return get_frequent_non_null_values(con, table, column, common_value_size)
  return result


def get_histogram_array(histogram_params: HistogramParams) -> list[str]:
  histogram_raw = get_equi_height_histogram(
    histogram_params.con,
    histogram_params.table,
    histogram_params.column.column_name,
…
def get_histogram_array_excluding_common_values(
  histogram_params: HistogramParams,
  common_values_size: int,
  distinct_count: int,
) -> list[str]:
  histogram_array: list[RawDuckDBHistograms] = []
  if (
    distinct_count < LIMIT_FOR_DISTINCT_VALUES
    and distinct_count > common_values_size
  if distinct_count > common_values_size:
  ):
    histogram_array = get_histogram_excluding_common_values(
      histogram_params.con,
      histogram_params.table,
      histogram_params.column.column_name,
      histogram_params.histogram_size,
      common_values_size,
    )
…
      if include_mvc:
        # Get most common values
        most_common_values = get_most_common_values(
          con,
          table,
          column.column_name,
          common_values_size,
          distinct_count,
        )

        # Get histogram array excluding common values
        histogram_array_excluding_mcv = (
          get_histogram_array_excluding_common_values(
            histogram_params,
            common_values_size,
            distinct_count,
          )
        )

        row_dict |= {
          HistogramColumns.MOST_COMMON_VALUES.value: [
            {
            {"value": value.value, "count": value.count}
              MostCommonValuesColumns.VALUE.value: value.value,
              MostCommonValuesColumns.COUNT.value: value.count,
            }
            for value in most_common_values
          ],
          HistogramColumns.HISTOGRAM_MCV.value: histogram_array_excluding_mcv,
        }

      rows.append(row_dict)
  return pl.DataFrame(rows)
Changes to src/query_generator/utils/definitions.py.
class Dataset(Enum):
  TPCDS = "TPCDS"
  TPCH = "TPCH"
  JOB = "JOB"


@dataclass
class PredicateOperatorProbability:
  """Probability of using a specific predicate operator.

  They are based on choice with weights for each operator.
  """

  operator_in: float
  operator_equal: float
  operator_range: float


@dataclass
class PredicateParameters:
  extra_predicates: int
  row_retention_probability: float
  operator_weights: PredicateOperatorProbability
  equality_lower_bound_probability: float
  extra_values_for_in: int


# TODO(Gabriel): http://localhost:8080/tktview/205e90a1fa
@dataclass
class QueryGenerationParameters:
  dataset: Dataset
  max_hops: int
  max_queries_per_signature: int
  max_queries_per_fact_table: int
  keep_edge_prob: float
  keep_edge_probability: float
  dataset: Dataset
  extra_predicates: int
  row_retention_probability: float
  seen_subgraphs: dict[int, bool]
  predicate_parameters: PredicateParameters


@dataclass
class GeneratedQueryFeatures:
  query: str
  template_number: int
  predicate_number: int
Changes to src/query_generator/utils/exceptions.py.
class DuplicateEdgesError(Exception):
  def __init__(self, table: str) -> None:
    super().__init__(f"Duplicate edges found for table {table}.")


class UnkwonDatasetError(Exception):
class UnkownDatasetError(Exception):
  def __init__(self, dataset: str) -> None:
    super().__init__(f"Unknown dataset: {dataset}")


class MissingScaleFactorError(Exception):
  def __init__(self, dataset: str) -> None:
    super().__init__(
Added src/query_generator/utils/params.py.
import tomllib
from dataclasses import dataclass
from pathlib import Path
from typing import TypeVar

from cattrs import structure

from query_generator.utils.definitions import (
  Dataset,
  PredicateOperatorProbability,
  PredicateParameters,
)


@dataclass
class SearchParametersEndpoint:
  """
  Represents the parameters used for configuring search queries, including
  query builder, subgraph, and predicate options.

  This class is designed to support both the `IN` and `=` statements in
  query generation.

  Attributes:
    dataset (Dataset): The dataset to be queried.
    dev (bool): Flag indicating whether to use development settings.
    max_queries_per_fact_table (int): Maximum number of queries per fact
      table.
    max_queries_per_signature (int): Maximum number of queries per
      signature.
    unique_joins (bool): Whether to enforce unique joins in the subgraph.
    max_hops (list[int]): Maximum number of hops allowed in the subgraph.
    keep_edge_probability (float): Probability of retaining an edge in the
      subgraph.
    extra_predicates (list[int]): Number of additional predicates to include
      in the query.
    row_retention_probability (list[float]): Probability of retaining a row
      for range predicates
    operator_weights (PredicateOperatorProbability): Probability
      distribution for predicate operators.
    equality_lower_bound_probability (float): Lower bound probability when
      using the `=` and the `IN` operators
  """

  # Query Builder
  dataset: Dataset
  dev: bool
  max_queries_per_fact_table: int
  max_queries_per_signature: int
  # Subgraph
  unique_joins: bool
  max_hops: list[int]
  keep_edge_probability: float
  # Predicates
  extra_predicates: list[int]
  row_retention_probability: list[float]
  operator_weights: PredicateOperatorProbability
  equality_lower_bound_probability: list[float]
  extra_values_for_in: int


@dataclass
class SnowflakeEndpoint:
  """
  Represents the parameters used for configuring query generation,
  including query builder, subgraph, and predicate options.

  Attributes:
    dataset (Dataset): The dataset to be used for query generation.
    max_queries_per_signature (int): Maximum number of queries to generate
      per signature.
    max_queries_per_fact_table (int): Maximum number of queries to generate
      per fact table.
    max_hops (int): Maximum number of hops allowed in the subgraph.
    keep_edge_probability (float): Probability of retaining an edge in the
      subgraph.
    extra_predicates (int): Number of extra predicates to add to the query.
    row_retention_probability (float): Probability of retaining a row after
      applying predicates.
    operator_weights (PredicateOperatorProbability): Probability
      distribution for predicate operators.
    equality_lower_bound_probability (float): Probability of using a lower
      bound for equality predicates.
  """

  # Query builder
  dataset: Dataset
  max_queries_per_signature: int
  max_queries_per_fact_table: int
  # Subgraph
  max_hops: int
  keep_edge_probability: float
  # Predicates
  predicate_parameters: PredicateParameters


T = TypeVar("T")


def read_and_parse_toml(path: Path, cls: type[T]) -> T:
  toml_dict = tomllib.loads(path.read_text())
  return structure(toml_dict, cls)
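
A minimal, self-contained illustration of what read_and_parse_toml does; the Mini* dataclasses below are stand-ins for the real endpoints: tomllib parses the text into plain dicts and cattrs.structure builds the typed object, including nested tables such as [operator_weights].

import tomllib
from dataclasses import dataclass

from cattrs import structure


@dataclass
class MiniWeights:
  operator_in: float
  operator_range: float
  operator_equal: float


@dataclass
class MiniConfig:
  dataset: str
  max_hops: list[int]
  operator_weights: MiniWeights


toml_text = """
dataset = "TPCDS"
max_hops = [1, 2, 4]

[operator_weights]
operator_in = 1
operator_range = 3
operator_equal = 3
"""

cfg = structure(tomllib.loads(toml_text), MiniConfig)
print(cfg.max_hops)                         # [1, 2, 4]
print(cfg.operator_weights.operator_range)  # 3.0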
Changes to tests/duckdb/test_binning.py.
import tomllib
from unittest import mock

from cattrs import structure
import polars as pl
import pytest

from query_generator.duckdb_connection.binning import (
  SearchParameters,
  run_snowflake_param_seach,
)
from query_generator.tools.cherry_pick_binning import make_bins_in_csv
from query_generator.utils.definitions import Dataset
from query_generator.utils.params import SearchParametersEndpoint


@pytest.mark.parametrize(
  "count_star, upper_bound, total_bins, expected_bin",
  [
    (5, 10, 5, 3),
    (0, 10, 5, 0),
…
    " but got {computed_bin}"
  )


@pytest.mark.parametrize(
  "extra_predicates, expected_call_count, unique_joins",
  [
    ([1], 120 * 1 + 14, False),
    ([1], 120 * 1 + 14, True),
    ("[1]", 120 * 1 + 14, "false"),
    ("[1]", 120 * 1 + 14, "true"),
    # Inventory is small and produces 14 queries total
    ([1, 2], 120 * 2 + 14, True),
    ([1, 2], 120 * 2 + 14 * 2, False),
    ("[1, 2]", 120 * 2 + 14, "true"),
    ("[1, 2]", 120 * 2 + 14 * 2, "false"),
  ],
)
def test_binning_calls(extra_predicates, expected_call_count, unique_joins):
  with mock.patch(
    "query_generator.duckdb_connection.binning.Writer.write_query_to_batch",
  ) as mock_writer:
    with mock.patch(
      "query_generator.duckdb_connection.binning.get_result_from_duckdb",
    ) as mock_connect:
      mock_connect.return_value = 0
      data_toml = f"""
        dataset = "TPCDS"
        dev = true
        max_hops = [1]
        extra_predicates = {extra_predicates}
        row_retention_probability = [0.2]
        unique_joins = {unique_joins}
        max_queries_per_fact_table = 10
        max_queries_per_signature = 2
        keep_edge_probability = 0.2
        equality_lower_bound_probability = [0]
        extra_values_for_in = 3

        [operator_weights]
        operator_in = 1
        operator_range = 3
        operator_equal = 3
        """
      user_input = structure(tomllib.loads(data_toml), SearchParametersEndpoint)
      run_snowflake_param_seach(
        search_params=SearchParameters(
          scale_factor=0,
          dataset=Dataset.TPCDS,
          max_hops=[1],
          extra_predicates=extra_predicates,
          row_retention_probability=[0.2],
          con=None,
          unique_joins=unique_joins,
          user_input=user_input,
        ),
      )
    assert mock_writer.call_count == expected_call_count, (
      f"Expected {expected_call_count} calls to write_query, "
      f"but got {mock_writer.call_count}"
    )
Changes to tests/duckdb/test_duckdb_utils.py.
import datetime

from query_generator.duckdb_connection.setup import setup_duckdb
from query_generator.duckdb_connection.utils import (
  get_distinct_count,
  get_equi_height_histogram,
  get_frequent_non_null_values,
)
from query_generator.tools.histograms import DuckDBHistogramParser
from query_generator.utils.definitions import Dataset
from tests.utils import is_date, is_float
from tests.utils import is_float
import datetime


def test_distinct_values():
  """Test the setup of DuckDB."""
  # Setup DuckDB
  con = setup_duckdb(Dataset.TPCDS, 0.1)
  assert get_distinct_count(con, "call_center", "cc_call_center_sk") == 1
Changes to tests/file_management/test_read_histograms.py.
from unittest import mock

import polars as pl
import pytest

import polars as pl
from query_generator.predicate_generator.predicate_generator import (
  HistogramDataType,
  PredicateGenerator,
)
from query_generator.tools.histograms import HistogramColumns
from query_generator.utils.definitions import Dataset
from query_generator.utils.definitions import Dataset, PredicateParameters
from query_generator.utils.exceptions import InvalidHistogramTypeError


def test_read_histograms():
  for dataset in Dataset:
    predicate_generator = PredicateGenerator(dataset)
    predicate_generator = PredicateGenerator(dataset, None)
    histogram = predicate_generator.read_histogram()
    assert not histogram.is_empty()

    assert histogram[HistogramColumns.DTYPE.value].dtype == pl.Utf8
    assert histogram[HistogramColumns.COLUMN.value].dtype == pl.Utf8
    assert histogram[HistogramColumns.DTYPE.value].dtype == pl.Utf8
    assert histogram[HistogramColumns.HISTOGRAM.value].dtype == pl.List(pl.Utf8)
…
  max_index,
  dtype,
):
  with mock.patch(
    "query_generator.predicate_generator.predicate_generator.random.randint",
    return_value=mock_rand,
  ):
    predicate_generator = PredicateGenerator(Dataset.TPCH)
    predicate_generator = PredicateGenerator(
      Dataset.TPCH,
      PredicateParameters(
        extra_predicates=None,
        row_retention_probability=row_retention_probability,
        operator_weights=None,
        equality_lower_bound_probability=None,
        extra_values_for_in=None,
      ),
    )
    min_value, max_value = predicate_generator._get_min_max_from_bins(
      bins_array, row_retention_probability, dtype
      bins_array, dtype
    )
  assert min_value == bins_array[min_index]
  assert max_value == bins_array[max_index]


def test_get_invalid_histogram_type():
  predicate_generator = PredicateGenerator(Dataset.TPCH)
  predicate_generator = PredicateGenerator(Dataset.TPCH, None)
  with pytest.raises(InvalidHistogramTypeError):
    predicate_generator._get_histogram_type("not_supported_type")


@pytest.mark.parametrize(
  "input_type, expected_type",
  [
    ("INTEGER", HistogramDataType.INT),
    ("BIGINT", HistogramDataType.INT),
    ("DECIMAL(10,2)", HistogramDataType.FLOAT),
    ("DECIMAL(7,4)", HistogramDataType.FLOAT),
    ("DATE", HistogramDataType.DATE),
    ("VARCHAR", HistogramDataType.STRING),
  ],
)
def test_get_valid_histogram_type(input_type, expected_type):
  predicate_generator = PredicateGenerator(Dataset.TPCH)
  predicate_generator = PredicateGenerator(Dataset.TPCH, None)
  assert predicate_generator._get_histogram_type(input_type) == expected_type
Changes to tests/query_generation/test_make_queries.py.
from unittest import mock

import pytest
from pypika import functions as fn


from query_generator.database_schemas.schemas import get_schema
from query_generator.join_based_query_generator.snowflake import (
  QueryBuilder,
  generate_and_write_queries,
)
from query_generator.predicate_generator.predicate_generator import (
  HistogramDataType,
  PredicateGenerator,
  PredicateRange,
)
from query_generator.utils.definitions import (
  Dataset,
  PredicateOperatorProbability,
  PredicateParameters,
  QueryGenerationParameters,
)
from query_generator.utils.exceptions import UnkwonDatasetError
from query_generator.utils.exceptions import UnkownDatasetError
from pypika import OracleQuery
from pypika import functions as fn


def test_tpch_query_generation():
  with mock.patch(
    "query_generator.join_based_query_generator.snowflake.Writer.write_query",
  ) as mock_writer:
    generate_and_write_queries(
      QueryGenerationParameters(
        dataset=Dataset.TPCDS,
        max_hops=1,
        max_queries_per_fact_table=1,
        max_queries_per_signature=1,
        keep_edge_prob=0.2,
        row_retention_probability=0.2,
        extra_predicates=1,
        seen_subgraphs={},
      ),
        keep_edge_probability=0.2,
        seen_subgraphs={},
        predicate_parameters=PredicateParameters(
          operator_weights=PredicateOperatorProbability(
            operator_in=0.4,
            operator_equal=0.4,
            operator_range=0.2,
          ),
          extra_predicates=1,
          row_retention_probability=0.2,
          equality_lower_bound_probability=0,
          extra_values_for_in=3,
        ),
      )
    )

    assert mock_writer.call_count > 5


def test_tpcds_query_generation():
  with mock.patch(
    "query_generator.join_based_query_generator.snowflake.Writer.write_query",
  ) as mock_writer:
    generate_and_write_queries(
      QueryGenerationParameters(
        dataset=Dataset.TPCDS,
        max_hops=1,
        max_queries_per_fact_table=1,
        max_queries_per_signature=1,
        keep_edge_prob=0.2,
        row_retention_probability=0.2,
        extra_predicates=1,
        seen_subgraphs={},
        keep_edge_probability=0.2,
        seen_subgraphs={},
        predicate_parameters=PredicateParameters(
          operator_weights=PredicateOperatorProbability(
            operator_in=0.4,
            operator_equal=0.4,
            operator_range=0.2,
          ),
          extra_predicates=1,
          row_retention_probability=0.2,
          equality_lower_bound_probability=0,
          extra_values_for_in=3,
        ),
      ),
    )

    assert mock_writer.call_count > 5


def test_non_implemented_dataset():
  with mock.patch(
    "query_generator.join_based_query_generator.snowflake.Writer.write_query",
  ) as mock_writer:
    with pytest.raises(UnkwonDatasetError):
    with pytest.raises(UnkownDatasetError):
      generate_and_write_queries(
        QueryGenerationParameters(
          dataset="non_implemented_dataset",
          max_hops=1,
          max_queries_per_fact_table=1,
          max_queries_per_signature=1,
          keep_edge_prob=0.2,
          row_retention_probability=0.2,
          extra_predicates=1,
          seen_subgraphs={},
          keep_edge_probability=0.2,
          seen_subgraphs={},
          predicate_parameters=PredicateParameters(
            operator_weights=PredicateOperatorProbability(
              operator_in=0.4,
              operator_equal=0.4,
              operator_range=0.2,
            ),
            extra_predicates=1,
            row_retention_probability=0.2,
            equality_lower_bound_probability=0,
            extra_values_for_in=3,
          ),
        ),
      )
    assert mock_writer.call_count == 0


def test_add_rage_supports_all_histogram_types():
  tables_schema, _ = get_schema(Dataset.TPCH)
  query_builder = QueryBuilder(None, tables_schema, Dataset.TPCH)
  query_builder = QueryBuilder(
    None,
    tables_schema,
    Dataset.TPCH,
    PredicateParameters(
      extra_predicates=None,
      row_retention_probability=0.2,
      operator_weights=None,
      equality_lower_bound_probability=None,
      extra_values_for_in=None,
    ),
  )
  for dtype in HistogramDataType:
    query_builder._add_range(
      OracleQuery()
      .from_(query_builder.table_to_pypika_table["lineitem"])
      .select(fn.Count("*")),
      PredicateGenerator.Predicate(
      PredicateRange(
        table="lineitem",
        column="foo",
        min_value=2020,
        max_value=2020,
        dtype=dtype,
      ),
    )