Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Changes In Branch new-predicates Excluding Merge-Ins
This is equivalent to a diff from 0f0856db5a to 4162607284
2025-05-28
| ||
09:27 | Merges IN and = predicate check-in: 0a0518ab14 user: mathos tags: trunk | |
09:24 | Adds equality lower bound as an array. Leaf check-in: 4162607284 user: mathos tags: new-predicates | |
00:05 | Finally. A stable version check-in: 48e17f1cde user: mathos tags: new-predicates | |
2025-05-26
| ||
11:22 | Minor refactor to predicate class. Ticket [1e726428f6e719fb] check-in: c93b2b766c user: mathos tags: new-predicates | |
11:12 | Fix to be able to save the CSV check-in: 0f0856db5a user: mathos tags: trunk | |
09:57 | adds table_size to histogram check-in: 288ba9b582 user: mathos tags: trunk | |
Changes to data/histograms/histogram_tpcds.parquet.
cannot compute difference between binary files
Added params_config/search_params/tpcds.toml.
|
Added params_config/search_params/tpcds_dev.toml.
|
Added params_config/snowflake/tpcds.toml.
|
Changes to pyproject.toml.
︙ | |||
26 27 28 29 30 31 32 33 34 35 36 37 38 39 | 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | + | typer = ">=0.15.2,<0.16" rich = ">=14.0.0,<15" pypika = ">=0.48.9,<0.49" numpy = ">=2.2.5,<3" duckdb = ">=1.2.2,<2" polars = ">=1.27.1,<2" tqdm = "*" cattrs = ">=24.1.2,<25" [tool.pixi.feature.test.dependencies] pytest = ">=8.3.5,<9" [tool.pixi.feature.lint.dependencies] ruff = ">=0.11.7,<0.12" |
︙ |
Changes to src/query_generator/database_schemas/schemas.py.
1 2 3 4 5 6 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | - + - + | from typing import Any from query_generator.database_schemas.tpcds import get_tpcds_table_info from query_generator.database_schemas.tpch import get_tpch_table_info from query_generator.utils.definitions import Dataset from query_generator.utils.exceptions import ( PartiallySupportedDatasetError, |
Changes to src/query_generator/database_schemas/tpcds.py.
︙ | |||
436 437 438 439 440 441 442 | 436 437 438 439 440 441 442 443 444 445 446 447 448 449 | - | "s_floor_space": {"max": 9917607, "min": 5010719}, "s_gmt_offset": {"max": -5.0, "min": -6.0}, "s_market_id": {"max": 10, "min": 1}, "s_number_employees": {"max": 300, "min": 200}, "s_rec_end_date": {"max": "2001-03-12", "min": "1999-03-13"}, "s_rec_start_date": {"max": "2001-03-13", "min": "1997-03-13"}, "s_store_sk": {"max": 402, "min": 1}, |
︙ |
Changes to src/query_generator/duckdb_connection/binning.py.
︙ | |||
9 10 11 12 13 14 15 | 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | - + + - + - - - - - + + - + - + + + + - + + + - - - + + + + - + - - - - - + + + + + + + - + + + + | QueryGenerator, ) from query_generator.join_based_query_generator.utils.query_writer import ( Writer, ) from query_generator.utils.definitions import ( BatchGeneratedQueryFeatures, |
︙ | |||
124 125 126 127 128 129 130 | 133 134 135 136 137 138 139 140 141 142 143 | - + | "predicate_number": query.predicate_number, "fact_table": query.fact_table, "max_hops": max_hops, "row_retention_probability": row_retention_probability, }, ) # Update the seen subgraphs with the new ones |
Changes to src/query_generator/duckdb_connection/setup.py.
1 2 3 4 5 6 7 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | - + | import os import duckdb from query_generator.utils.definitions import Dataset from query_generator.utils.exceptions import ( MissingScaleFactorError, PartiallySupportedDatasetError, |
︙ | |||
25 26 27 28 29 30 31 | 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | - + - + | if dataset == Dataset.TPCDS: con.execute(f"CALL dsdgen(sf = {scale_factor})") elif dataset == Dataset.TPCH: con.execute(f"CALL dbgen(sf = {scale_factor})") elif dataset == Dataset.JOB: raise PartiallySupportedDatasetError(dataset.value) else: |
︙ |
Changes to src/query_generator/join_based_query_generator/snowflake.py.
︙ | |||
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | + + + + + + - + | # fmt: on from query_generator.join_based_query_generator.utils.query_writer import ( Writer, ) from query_generator.predicate_generator.predicate_generator import ( HistogramDataType, PredicateEquality, PredicateGenerator, PredicateIn, PredicateRange, SupportedHistogramType, ) from query_generator.utils.definitions import ( Dataset, Extension, GeneratedQueryFeatures, PredicateParameters, QueryGenerationParameters, ) from query_generator.utils.exceptions import InvalidHistogramTypeError from query_generator.utils.utils import set_seed class QueryBuilder: def __init__( self, subgraph_generator: SubGraphGenerator, # TODO(Gabriel): http://localhost:8080/tktview/b9400c203a38f3aef46ec250d98563638ba7988b tables_schema: Any, dataset: Dataset, predicate_params: PredicateParameters, ) -> None: self.sub_graph_gen = subgraph_generator self.table_to_pypika_table = { i: Table(i, alias=tables_schema[i]["alias"]) for i in tables_schema } |
︙ | |||
82 83 84 85 86 87 88 | 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | - - - - + - + + + + + + - - - + + + - - - - + + + - - + - - - + + - - - + + + - + - - + + - + - - - - - + - - - + + - - - + - + + - - | ) return query def add_predicates( self, subgraph: list[ForeignKeyGraph.Edge], query: OracleQuery, |
︙ |
Changes to src/query_generator/join_based_query_generator/utils/subgraph_generator.py.
︙ | |||
9 10 11 12 13 14 15 | 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | - + - + - + + - + | MAX_ATTEMPTS_FOR_NEW_SUBGRAPH = 1000 class SubGraphGenerator: def __init__( self, graph: ForeignKeyGraph, |
︙ |
Changes to src/query_generator/main.py.
︙ | |||
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 | + + + + + - + - - - - - + - - + - - - - - - - - - + - - + - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - + + - - - - - + + + + + - - + - - - - - - + - - - - - - - - - - - - - - - - - + - - + - - - - - - - - - + - - - - - - - - - - + - - + - + - - - + + + - - - - - - + + + + - + - - - - | make_redundant_histograms, query_histograms, ) from query_generator.utils.definitions import ( Dataset, Extension, QueryGenerationParameters, ) from query_generator.utils.params import ( SearchParametersEndpoint, SnowflakeEndpoint, read_and_parse_toml, ) from query_generator.utils.show_messages import show_dev_warning from query_generator.utils.utils import validate_file_path app = typer.Typer(name="Query Generation") @app.command() def snowflake( |
︙ |
Changes to src/query_generator/predicate_generator/predicate_generator.py.
1 2 3 4 5 6 7 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 | + + - - + + + + + + + + + - + + + + + + + + + + + + - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - + + - - + + - + - + + + + + + + + + + + + + + - + - + - + + + + + + + + + + + + + + + + + + - - - + - + - - + + + - + + + + + + + - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + - + | import math import random from abc import ABC from collections.abc import Iterator from dataclasses import dataclass from enum import Enum import numpy as np import polars as pl |
Changes to src/query_generator/tools/histograms.py.
︙ | |||
16 17 18 19 20 21 22 | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | - + + + + | get_equi_height_histogram, get_frequent_non_null_values, get_histogram_excluding_common_values, get_tables, ) from query_generator.utils.exceptions import InvalidHistogramTypeError |
︙ | |||
87 88 89 90 91 92 93 | 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | - - - - + - | def get_most_common_values( con: duckdb.DuckDBPyConnection, table: str, column: str, common_value_size: int, |
︙ | |||
114 115 116 117 118 119 120 | 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | - - - + - | def get_histogram_array_excluding_common_values( histogram_params: HistogramParams, common_values_size: int, distinct_count: int, ) -> list[str]: histogram_array: list[RawDuckDBHistograms] = [] |
︙ | |||
180 181 182 183 184 185 186 | 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | - + - + + + | if include_mvc: # Get most common values most_common_values = get_most_common_values( con, table, column.column_name, common_values_size, |
︙ |
Changes to src/query_generator/utils/definitions.py.
︙ | |||
15 16 17 18 19 20 21 22 23 24 25 | 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | + + + + + + + + + + + + + + + + + + + + + + + - + - - - + | class Dataset(Enum): TPCDS = "TPCDS" TPCH = "TPCH" JOB = "JOB" @dataclass class PredicateOperatorProbability: """Probability of using a specific predicate operator. They are based on choice with weights for each operator. """ operator_in: float operator_equal: float operator_range: float @dataclass class PredicateParameters: extra_predicates: int row_retention_probability: float operator_weights: PredicateOperatorProbability equality_lower_bound_probability: float extra_values_for_in: int # TODO(Gabriel): http://localhost:8080/tktview/205e90a1fa @dataclass class QueryGenerationParameters: dataset: Dataset max_hops: int max_queries_per_signature: int max_queries_per_fact_table: int |
︙ |
Changes to src/query_generator/utils/exceptions.py.
︙ | |||
23 24 25 26 27 28 29 | 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | - + | class DuplicateEdgesError(Exception): def __init__(self, table: str) -> None: super().__init__(f"Duplicate edges found for table {table}.") |
︙ |
Added src/query_generator/utils/params.py.