sqlglot.dialects.bigquery
from __future__ import annotations

import logging
import re
import typing as t


from sqlglot.optimizer.annotate_types import TypeAnnotator

from sqlglot import exp, generator, jsonpath, parser, tokens, transforms
from sqlglot._typing import E
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    annotate_with_type_lambda,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
    strposition_sql,
    groupconcat_sql,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType
from sqlglot.generator import unsupported_args

if t.TYPE_CHECKING:
    from sqlglot._typing import Lit

    from sqlglot.optimizer.annotate_types import TypeAnnotator

logger = logging.getLogger("sqlglot")


JSON_EXTRACT_TYPE = t.Union[exp.JSONExtract, exp.JSONExtractScalar, exp.JSONExtractArray]

DQUOTES_ESCAPING_JSON_FUNCTIONS = ("JSON_QUERY", "JSON_VALUE", "JSON_QUERY_ARRAY")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

    if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
        expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# workaround for bigquery bug when grouping by an expression and then ordering
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY x + 1
# ORDER BY z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression

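# Illustrative usage (an added sketch, not part of the original module): when a SELECT
# both groups by an expression and orders by its alias, the transform above swaps the
# GROUP BY expression for the projection alias, assuming the public transpile API:
#
#   import sqlglot
#   sqlglot.transpile(
#       "SELECT y + 1 AS z FROM x GROUP BY y + 1 ORDER BY z", write="bigquery"
#   )[0]
#   # -> roughly 'SELECT y + 1 AS z FROM x GROUP BY z ORDER BY z'
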
def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression


def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)


def _build_json_strip_nulls(args: t.List) -> exp.JSONStripNulls:
    expression = exp.JSONStripNulls(this=seq_get(args, 0))

    for arg in args[1:]:
        if isinstance(arg, exp.Kwarg):
            expression.set(arg.this.name.lower(), arg)
        else:
            expression.set("expression", arg)

    return expression


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
    expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP))
    unit = unit_to_var(expression)
    return self.func("DATE_DIFF", expression.this, expression.expression, unit)


def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("TIMESTAMP_SECONDS", timestamp)
    if scale == exp.UnixToTime.MILLIS:
        return self.func("TIMESTAMP_MILLIS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        return self.func("TIMESTAMP_MICROS", timestamp)

    unix_seconds = exp.cast(
        exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
    )
    return self.func("TIMESTAMP_SECONDS", unix_seconds)

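# Illustrative sketch (an addition, not part of the original module): the scale on the
# canonical exp.UnixToTime node picks the TIMESTAMP_* function above, so for example a
# DuckDB EPOCH_MS call should come out as TIMESTAMP_MILLIS:
#
#   import sqlglot
#   sqlglot.transpile("SELECT EPOCH_MS(col)", read="duckdb", write="bigquery")[0]
#   # -> roughly 'SELECT TIMESTAMP_MILLIS(col)'
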
def _build_time(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTime(this=args[0])
    if len(args) == 2:
        return exp.Time.from_arg_list(args)
    return exp.TimeFromParts.from_arg_list(args)


def _build_datetime(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToDatetime.from_arg_list(args)
    if len(args) == 2:
        return exp.Datetime.from_arg_list(args)
    return exp.TimestampFromParts.from_arg_list(args)


def _build_regexp_extract(
    expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        try:
            group = re.compile(args[1].name).groups == 1
        except re.error:
            group = False

        # Default group is used for the transpilation of REGEXP_EXTRACT_ALL
        return expr_type(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            group=exp.Literal.number(1) if group else default_group,
        )

    return _builder


def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
    def _builder(args: t.List, dialect: Dialect) -> E:
        if len(args) == 1:
            # The default value for the JSONPath is '$', i.e. all of the data
            args.append(exp.Literal.string("$"))
        return parser.build_extract_json_with_path(expr_type)(args, dialect)

    return _builder


def _str_to_datetime_sql(
    self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime
) -> str:
    this = self.sql(expression, "this")
    dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP"

    if expression.args.get("safe"):
        fmt = self.format_time(
            expression,
            self.dialect.INVERSE_FORMAT_MAPPING,
            self.dialect.INVERSE_FORMAT_TRIE,
        )
        return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})"

    fmt = self.format_time(expression)
    return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone"))


def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E:
    """
    Many BigQuery math functions such as CEIL, FLOOR etc. follow this return type convention:
    +---------+---------+---------+------------+---------+
    | INPUT   | INT64   | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    | OUTPUT  | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    """
    self._annotate_args(expression)

    this: exp.Expression = expression.this

    self._set_type(
        expression,
        exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type,
    )
    return expression

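# Illustrative sketch (an addition, not part of the original module): per the table
# above, FLOOR over an INT64 input is annotated FLOAT64, assuming the public
# annotate_types API:
#
#   import sqlglot
#   from sqlglot.optimizer.annotate_types import annotate_types
#   ast = sqlglot.parse_one("SELECT FLOOR(CAST(x AS INT64)) AS f FROM t", read="bigquery")
#   annotate_types(ast, dialect="bigquery").selects[0].type  # -> DOUBLE, i.e. FLOAT64
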
def _annotate_by_args_with_coerce(self: TypeAnnotator, expression: E) -> E:
    """
    +------------+------------+------------+------------+---------+
    | INPUT      | INT64      | NUMERIC    | BIGNUMERIC | FLOAT64 |
    +------------+------------+------------+------------+---------+
    | INT64      | INT64      | NUMERIC    | BIGNUMERIC | FLOAT64 |
    | NUMERIC    | NUMERIC    | NUMERIC    | BIGNUMERIC | FLOAT64 |
    | BIGNUMERIC | BIGNUMERIC | BIGNUMERIC | BIGNUMERIC | FLOAT64 |
    | FLOAT64    | FLOAT64    | FLOAT64    | FLOAT64    | FLOAT64 |
    +------------+------------+------------+------------+---------+
    """
    self._annotate_args(expression)

    self._set_type(expression, self._maybe_coerce(expression.this.type, expression.expression.type))
    return expression


def _annotate_by_args_approx_top(self: TypeAnnotator, expression: exp.ApproxTopK) -> exp.ApproxTopK:
    self._annotate_args(expression)

    struct_type = exp.DataType(
        this=exp.DataType.Type.STRUCT,
        expressions=[expression.this.type, exp.DataType(this=exp.DataType.Type.BIGINT)],
        nested=True,
    )
    self._set_type(
        expression,
        exp.DataType(this=exp.DataType.Type.ARRAY, expressions=[struct_type], nested=True),
    )

    return expression


@unsupported_args("ins_cost", "del_cost", "sub_cost")
def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str:
    max_dist = expression.args.get("max_dist")
    if max_dist:
        max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist)

    return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist)


def _build_levenshtein(args: t.List) -> exp.Levenshtein:
    max_dist = seq_get(args, 2)
    return exp.Levenshtein(
        this=seq_get(args, 0),
        expression=seq_get(args, 1),
        max_dist=max_dist.expression if max_dist else None,
    )


def _build_format_time(expr_type: t.Type[exp.Expression]) -> t.Callable[[t.List], exp.TimeToStr]:
    def _builder(args: t.List) -> exp.TimeToStr:
        return exp.TimeToStr(
            this=expr_type(this=seq_get(args, 1)),
            format=seq_get(args, 0),
            zone=seq_get(args, 2),
        )

    return _builder


def _build_contains_substring(args: t.List) -> exp.Contains:
    # Lowercase the operands in case of transpilation, as exp.Contains
    # is case-sensitive on other dialects
    this = exp.Lower(this=seq_get(args, 0))
    expr = exp.Lower(this=seq_get(args, 1))

    return exp.Contains(this=this, expression=expr, json_scope=seq_get(args, 2))


def _json_extract_sql(self: BigQuery.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    name = (expression._meta and expression.meta.get("name")) or expression.sql_name()
    upper = name.upper()

    dquote_escaping = upper in DQUOTES_ESCAPING_JSON_FUNCTIONS

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = False

    sql = rename_func(upper)(self, expression)

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = True

    return sql


def _annotate_concat(self: TypeAnnotator, expression: exp.Concat) -> exp.Concat:
    annotated = self._annotate_by_args(expression, "expressions")

    # Args must be BYTES or types that can be cast to STRING, return type is either BYTES or STRING
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#concat
    if not annotated.is_type(exp.DataType.Type.BINARY, exp.DataType.Type.UNKNOWN):
        annotated.type = exp.DataType.Type.VARCHAR

    return annotated

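# Illustrative sketch (an addition, not part of the original module): CONCAT's return
# type collapses to STRING unless the arguments are BYTES, mirroring the rule in
# _annotate_concat above:
#
#   import sqlglot
#   from sqlglot.optimizer.annotate_types import annotate_types
#   ast = sqlglot.parse_one("SELECT CONCAT(1, 2.5)", read="bigquery")
#   annotate_types(ast, dialect="bigquery").selects[0].type  # -> VARCHAR, i.e. STRING
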
def _annotate_array(self: TypeAnnotator, expression: exp.Array) -> exp.Array:
    array_args = expression.expressions

    # BigQuery behaves as follows:
    #
    # SELECT t, TYPEOF(t) FROM (SELECT 'foo') AS t -- foo, STRUCT<STRING>
    # SELECT ARRAY(SELECT 'foo'), TYPEOF(ARRAY(SELECT 'foo')) -- foo, ARRAY<STRING>
    if (
        len(array_args) == 1
        and isinstance(select := array_args[0].unnest(), exp.Select)
        and (query_type := select.meta.get("query_type")) is not None
        and query_type.is_type(exp.DataType.Type.STRUCT)
        and len(query_type.expressions) == 1
        and isinstance(col_def := query_type.expressions[0], exp.ColumnDef)
        and (projection_type := col_def.kind) is not None
        and not projection_type.is_type(exp.DataType.Type.UNKNOWN)
    ):
        array_type = exp.DataType(
            this=exp.DataType.Type.ARRAY,
            expressions=[projection_type.copy()],
            nested=True,
        )
        return self._annotate_with_type(expression, array_type)

    return self._annotate_by_args(expression, "expressions", array=True)


class BigQuery(Dialect):
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    PRESERVE_ORIGINAL_NAMES = True
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # bigquery udfs are case sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
        "%e": "%-d",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    # https://cloud.google.com/bigquery/docs/querying-wildcard-tables#scanning_a_range_of_tables_using_table_suffix
    # https://cloud.google.com/bigquery/docs/query-cloud-storage-data#query_the_file_name_pseudo-column
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE", "_TABLE_SUFFIX", "_FILE_NAME"}

    # All set operations require either a DISTINCT or ALL specifier
    SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None)

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#percentile_cont
    COERCES_TO = {
        **TypeAnnotator.COERCES_TO,
        exp.DataType.Type.BIGDECIMAL: {exp.DataType.Type.DOUBLE},
    }
    COERCES_TO[exp.DataType.Type.DECIMAL] |= {exp.DataType.Type.BIGDECIMAL}
    COERCES_TO[exp.DataType.Type.BIGINT] |= {exp.DataType.Type.BIGDECIMAL}

    # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types
    TYPE_TO_EXPRESSIONS = {
        **Dialect.TYPE_TO_EXPRESSIONS,
        exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP],
    }
    TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP)

    ANNOTATORS = {
        **Dialect.ANNOTATORS,
        **{
            expr_type: annotate_with_type_lambda(data_type)
            for data_type, expressions in TYPE_TO_EXPRESSIONS.items()
            for expr_type in expressions
        },
        **{
            expr_type: lambda self, e: _annotate_math_functions(self, e)
            for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round)
        },
        **{
            expr_type: lambda self, e: self._annotate_by_args(e, "this")
            for expr_type in (
                exp.Abs,
                exp.ArgMax,
                exp.ArgMin,
                exp.DateTrunc,
                exp.DatetimeTrunc,
                exp.FirstValue,
                exp.GroupConcat,
                exp.IgnoreNulls,
                exp.JSONExtract,
                exp.Lead,
                exp.Left,
                exp.Lower,
                exp.NthValue,
                exp.Pad,
                exp.PercentileDisc,
                exp.RegexpExtract,
                exp.RegexpReplace,
                exp.Repeat,
                exp.Replace,
                exp.RespectNulls,
                exp.Reverse,
                exp.Right,
                exp.SafeNegate,
                exp.Sign,
                exp.Substring,
                exp.TimestampTrunc,
                exp.Translate,
                exp.Trim,
                exp.Upper,
            )
        },
        exp.Acos: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Acosh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Asin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Asinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Atan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Atanh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Atan2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.ApproxTopSum: lambda self, e: _annotate_by_args_approx_top(self, e),
        exp.ApproxTopK: lambda self, e: _annotate_by_args_approx_top(self, e),
        exp.ApproxQuantiles: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.Array: _annotate_array,
        exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
        exp.Ascii: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseAndAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseOrAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseXorAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseCountAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.ByteLength: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.ByteString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Cbrt: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CodePointsToBytes: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.BINARY
        ),
        exp.CodePointsToString: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.VARCHAR
        ),
        exp.Concat: _annotate_concat,
        exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Cot: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CosineDistance: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Coth: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Csc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Csch: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CumeDist: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.DateFromUnixDate: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATE),
        exp.DenseRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.EuclideanDistance: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.DOUBLE
        ),
        exp.FarmFingerprint: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.Unhex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Float64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Format: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.GenerateTimestampArray: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<TIMESTAMP>", dialect="bigquery")
        ),
        exp.Grouping: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.IsInf: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.IsNan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.JSONArray: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONArrayAppend: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONArrayInsert: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.JSONExtractScalar: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.VARCHAR
        ),
        exp.JSONExtractArray: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.JSONFormat: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.JSON if e.args.get("to_json") else exp.DataType.Type.VARCHAR
        ),
        exp.JSONKeysAtDepth: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery")
        ),
        exp.JSONObject: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONRemove: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONSet: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONStripNulls: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONType: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.JSONValueArray: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery")
        ),
        exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"),
        exp.LowerHex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.LaxBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.LaxFloat64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.LaxInt64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.LaxString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.MD5Digest: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Normalize: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.Ntile: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.ParseTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.ParseDatetime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATETIME),
        exp.ParseBignumeric: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.BIGDECIMAL
        ),
        exp.ParseNumeric: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DECIMAL),
        exp.PercentileCont: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.PercentRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Rank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.RangeBucket: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.RegexpExtractAll: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.RegexpInstr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.RowNumber: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.Rand: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.SafeConvertBytesToString: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.VARCHAR
        ),
        exp.SafeAdd: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.SafeMultiply: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.SafeSubtract: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.Sec: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Sech: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Soundex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Sin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Sinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.TimestampFromParts: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.DATETIME
        ),
        exp.TimeFromParts: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.TimeTrunc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.ToCodePoints: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<BIGINT>", dialect="bigquery")
        ),
        exp.TsOrDsToTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.Unicode: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.Uuid: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
    }

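    # Illustrative sketch (an addition, not part of the original module): ANNOTATORS
    # drives type inference for BigQuery-specific functions, e.g. the exp.Split entry
    # above types SPLIT as an array of its input type, assuming the public
    # annotate_types API:
    #
    #   import sqlglot
    #   from sqlglot.optimizer.annotate_types import annotate_types
    #   ast = sqlglot.parse_one("SELECT SPLIT('a,b,c')", read="bigquery")
    #   annotate_types(ast, dialect="bigquery").selects[0].type  # -> roughly ARRAY<VARCHAR>
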
    def normalize_identifier(self, expression: E) -> E:
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE
        ):
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

            return t.cast(E, expression)

        return super().normalize_identifier(expression)

    class JSONPathTokenizer(jsonpath.JSONPathTokenizer):
        VAR_TOKENS = {
            TokenType.DASH,
            TokenType.VAR,
        }

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        NESTED_COMMENTS = False

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.DECLARE,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "EXPORT": TokenType.EXPORT,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "LOOP": TokenType.COMMAND,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "REPEAT": TokenType.COMMAND,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
            "WHILE": TokenType.COMMAND,
        }
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True
        JOINS_HAVE_EQUAL_PRECEDENCE = True

        # BigQuery does not allow ASC/DESC to be used as an identifier
        ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC}
        ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
        COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - {
            TokenType.ASC,
            TokenType.DESC,
        }
        UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list,
            "BIT_AND": exp.BitwiseAndAgg.from_arg_list,
            "BIT_OR": exp.BitwiseOrAgg.from_arg_list,
            "BIT_XOR": exp.BitwiseXorAgg.from_arg_list,
            "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list,
            "BOOL": exp.JSONBool.from_arg_list,
            "CONTAINS_SUBSTR": _build_contains_substring,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=seq_get(args, 1),
                this=seq_get(args, 0),
                zone=seq_get(args, 2),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            "EDIT_DISTANCE": _build_levenshtein,
"FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 790 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 791 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 792 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 793 "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 794 "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list, 795 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 796 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 797 "JSON_STRIP_NULLS": _build_json_strip_nulls, 798 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 799 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 800 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 801 "MD5": exp.MD5Digest.from_arg_list, 802 "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize( 803 this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True 804 ), 805 "OCTET_LENGTH": exp.ByteLength.from_arg_list, 806 "TO_HEX": _build_to_hex, 807 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 808 [seq_get(args, 1), seq_get(args, 0)] 809 ), 810 "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")( 811 [seq_get(args, 1), seq_get(args, 0)] 812 ), 813 "PARSE_TIMESTAMP": _build_parse_timestamp, 814 "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")( 815 [seq_get(args, 1), seq_get(args, 0)] 816 ), 817 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 818 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 819 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 820 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 821 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 822 ), 823 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 824 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 825 "SPLIT": lambda args: exp.Split( 826 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 827 this=seq_get(args, 0), 828 expression=seq_get(args, 1) or exp.Literal.string(","), 829 ), 830 "STRPOS": exp.StrPosition.from_arg_list, 831 "TIME": _build_time, 832 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 833 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 834 "TIMESTAMP": _build_timestamp, 835 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 836 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 837 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 838 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 839 ), 840 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 841 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 842 ), 843 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 844 "TO_JSON": lambda args: exp.JSONFormat( 845 this=seq_get(args, 0), options=seq_get(args, 1), to_json=True 846 ), 847 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 848 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 849 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 850 "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime), 851 "FROM_HEX": exp.Unhex.from_arg_list, 852 "WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 853 } 854 855 FUNCTION_PARSERS = { 856 **parser.Parser.FUNCTION_PARSERS, 857 "ARRAY": lambda self: self.expression(exp.Array, 
        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
            "JSON_ARRAY": lambda self: self.expression(
                exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise)
            ),
            "MAKE_INTERVAL": lambda self: self._parse_make_interval(),
            "PREDICT": lambda self: self._parse_predict(),
            "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(),
            "GENERATE_EMBEDDING": lambda self: self._parse_generate_embedding(),
            "VECTOR_SEARCH": lambda self: self._parse_vector_search(),
        }
        FUNCTION_PARSERS.pop("TRIM")

        NO_PAREN_FUNCTIONS = {
            **parser.Parser.NO_PAREN_FUNCTIONS,
            TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
        }

        NESTED_TYPE_TOKENS = {
            *parser.Parser.NESTED_TYPE_TOKENS,
            TokenType.TABLE,
        }

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "NOT DETERMINISTIC": lambda self: self.expression(
                exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
            ),
            "OPTIONS": lambda self: self._parse_with_property(),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
        }

        RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
        RANGE_PARSERS.pop(TokenType.OVERLAPS)

        NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

        DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN}

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
            TokenType.END: lambda self: self._parse_as_command(self._prev),
            TokenType.FOR: lambda self: self._parse_for_in(),
            TokenType.EXPORT: lambda self: self._parse_export_data(),
            TokenType.DECLARE: lambda self: self._parse_declare(),
        }

        BRACKET_OFFSETS = {
            "OFFSET": (0, False),
            "ORDINAL": (1, False),
            "SAFE_OFFSET": (0, True),
            "SAFE_ORDINAL": (1, True),
        }

        def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]:
            index = self._index
            this = self._parse_range()
            self._match_text_seq("DO")
            if self._match(TokenType.COMMAND):
                self._retreat(index)
                return self._parse_as_command(self._prev)
            return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

        def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
            this = super()._parse_table_part(schema=schema) or self._parse_number()

            # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
            if isinstance(this, exp.Identifier):
                table_name = this.name
                while self._match(TokenType.DASH, advance=False) and self._next:
                    start = self._curr
                    while self._is_connected() and not self._match_set(
                        self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False
                    ):
                        self._advance()

                    if start == self._curr:
                        break

                    table_name += self._find_sql(start, self._prev)

                this = exp.Identifier(
                    this=table_name, quoted=this.args.get("quoted")
                ).update_positions(this)
            elif isinstance(this, exp.Literal):
                table_name = this.name

                if self._is_connected() and self._parse_var(any_token=True):
                    table_name += self._prev.text

                this = exp.Identifier(this=table_name, quoted=True).update_positions(this)

            return this

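        # Illustrative sketch (an addition, not part of the original module):
        # _parse_table_part stitches dashed, unquoted table parts such as BigQuery
        # project names back into a single identifier:
        #
        #   import sqlglot
        #   t = sqlglot.parse_one("SELECT * FROM my-project.dataset.tbl", read="bigquery")
        #   t.find(sqlglot.exp.Table).catalog  # -> 'my-project'
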
        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    previous_db = table.args["db"]
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set(
                            "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db)
                        )
                        table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db))
                else:
                    previous_this = table.this
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set(
                            "db", exp.Identifier(this=parts[0]).update_positions(previous_this)
                        )
                        table.set(
                            "this", exp.Identifier(this=parts[1]).update_positions(previous_this)
                        )

            if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
                alias = table.this
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                for part in (catalog, db, this):
                    if part:
                        part.update_positions(table.this)

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                table.meta["quoted_table"] = True
            else:
                alias = None

            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
            # dataset, so if the project identifier is omitted we need to fix the ast so that
            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
            # views, because it would seem like the "catalog" part is set, when it'd actually
            # be the region/dataset. Merging the two identifiers into a single one is done to
            # avoid producing a 4-part Table reference, which would cause issues in the schema
            # module, when there are 3-part table names mixed with information schema views.
            #
            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
            table_parts = table.parts
            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
                # We need to alias the table here to avoid breaking existing qualified columns.
                # This is expected to be safe, because if there's an actual alias coming up in
                # the token stream, it will overwrite this one. If there isn't one, we are only
                # exposing the name that can be used to reference the view explicitly (a no-op).
                exp.alias_(
                    table,
                    t.cast(exp.Identifier, alias or table_parts[-1]),
                    table=True,
                    copy=False,
                )

                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
                new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions(
                    line=table_parts[-2].meta.get("line"),
                    col=table_parts[-1].meta.get("col"),
                    start=table_parts[-2].meta.get("start"),
                    end=table_parts[-1].meta.get("end"),
                )
                table.set("this", new_this)
                table.set("db", seq_get(table_parts, -3))
                table.set("catalog", seq_get(table_parts, -4))

            return table

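        # Illustrative sketch (an addition, not part of the original module): the
        # INFORMATION_SCHEMA handling above folds the view name into one quoted
        # identifier so the table stays a 3-part reference:
        #
        #   import sqlglot
        #   t = sqlglot.parse_one("SELECT * FROM dataset.INFORMATION_SCHEMA.TABLES", read="bigquery")
        #   t.find(sqlglot.exp.Table).this.name  # -> 'INFORMATION_SCHEMA.TABLES'
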
        def _parse_column(self) -> t.Optional[exp.Expression]:
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            bracket = super()._parse_bracket(this)

            if this is bracket:
                return bracket

            if isinstance(bracket, exp.Bracket):
                for expression in bracket.expressions:
                    name = expression.name.upper()

                    if name not in self.BRACKET_OFFSETS:
                        break

                    offset, safe = self.BRACKET_OFFSETS[name]
                    bracket.set("offset", offset)
                    bracket.set("safe", safe)
                    expression.replace(expression.expressions[0])

            return bracket

        def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
            unnest = super()._parse_unnest(with_alias=with_alias)

            if not unnest:
                return None

            unnest_expr = seq_get(unnest.expressions, 0)
            if unnest_expr:
                from sqlglot.optimizer.annotate_types import annotate_types

                unnest_expr = annotate_types(unnest_expr, dialect=self.dialect)

                # Unnesting a nested array (i.e. array of structs) explodes the top-level struct fields,
                # in contrast to other dialects such as DuckDB which flattens only the array by default
                if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any(
                    array_elem.is_type(exp.DataType.Type.STRUCT)
                    for array_elem in unnest_expr._type.expressions
                ):
                    unnest.set("explode_array", True)

            return unnest

        def _parse_make_interval(self) -> exp.MakeInterval:
            expr = exp.MakeInterval()

            for arg_key in expr.arg_types:
                value = self._parse_lambda()

                if not value:
                    break

                # Non-named arguments are filled sequentially, (optionally) followed by named
                # arguments that can appear in any order, e.g. MAKE_INTERVAL(1, minute => 5, day => 2)
                if isinstance(value, exp.Kwarg):
                    arg_key = value.this.name

                expr.set(arg_key, value)

                self._match(TokenType.COMMA)

            return expr

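        # Illustrative sketch (an addition, not part of the original module):
        # MAKE_INTERVAL accepts positional arguments optionally followed by named ones,
        # which _parse_make_interval maps onto exp.MakeInterval args; the leading
        # positional argument fills the first declared slot:
        #
        #   import sqlglot
        #   sqlglot.parse_one("SELECT MAKE_INTERVAL(1, minute => 5, day => 2)", read="bigquery")
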
        def _parse_predict(self) -> exp.Predict:
            self._match_text_seq("MODEL")
            this = self._parse_table()

            self._match(TokenType.COMMA)
            self._match_text_seq("TABLE")

            return self.expression(
                exp.Predict,
                this=this,
                expression=self._parse_table(),
                params_struct=self._match(TokenType.COMMA) and self._parse_bitwise(),
            )

        def _parse_generate_embedding(self) -> exp.GenerateEmbedding:
            self._match_text_seq("MODEL")
            this = self._parse_table()

            self._match(TokenType.COMMA)
            self._match_text_seq("TABLE")

            return self.expression(
                exp.GenerateEmbedding,
                this=this,
                expression=self._parse_table(),
                params_struct=self._match(TokenType.COMMA) and self._parse_bitwise(),
            )

        def _parse_features_at_time(self) -> exp.FeaturesAtTime:
            self._match(TokenType.TABLE)
            this = self._parse_table()

            expr = self.expression(exp.FeaturesAtTime, this=this)

            while self._match(TokenType.COMMA):
                arg = self._parse_lambda()

                # Get the LHS of the Kwarg and set the arg to that value, e.g.
                # "num_rows => 1" sets the expr's `num_rows` arg
                if arg:
                    expr.set(arg.this.name, arg)

            return expr

        def _parse_vector_search(self) -> exp.VectorSearch:
            self._match(TokenType.TABLE)
            base_table = self._parse_table()

            self._match(TokenType.COMMA)

            column_to_search = self._parse_bitwise()
            self._match(TokenType.COMMA)

            self._match(TokenType.TABLE)
            query_table = self._parse_table()

            expr = self.expression(
                exp.VectorSearch,
                this=base_table,
                column_to_search=column_to_search,
                query_table=query_table,
            )

            while self._match(TokenType.COMMA):
                # query_column_to_search can be a named argument or positional
                if self._match(TokenType.STRING, advance=False):
                    query_column = self._parse_string()
                    expr.set("query_column_to_search", query_column)
                else:
                    arg = self._parse_lambda()
                    if arg:
                        expr.set(arg.this.name, arg)

            return expr

        def _parse_export_data(self) -> exp.Export:
            self._match_text_seq("DATA")

            return self.expression(
                exp.Export,
                connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(),
                options=self._parse_properties(),
                this=self._match_text_seq("AS") and self._parse_select(),
            )

    class Generator(generator.Generator):
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"
        SUPPORTS_EXPLODING_PROJECTIONS = False
        EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
        SUPPORTS_UNIX_SECONDS = True

        SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$")

        TS_OR_DS_TYPES = (
            exp.TsOrDsToDatetime,
            exp.TsOrDsToTimestamp,
            exp.TsOrDsToTime,
            exp.TsOrDsToDate,
        )

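        # Illustrative sketch of the TRANSFORMS table below (an addition, not part of
        # the original module): BigQuery has no ILIKE, so the exp.ILike entry lowers
        # both sides instead:
        #
        #   import sqlglot
        #   sqlglot.transpile("SELECT a ILIKE '%x%'", read="postgres", write="bigquery")[0]
        #   # -> roughly "SELECT LOWER(a) LIKE LOWER('%x%')"
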
        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"),
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.ArrayRemove: filter_array_using_unnest,
            exp.BitwiseAndAgg: rename_func("BIT_AND"),
            exp.BitwiseOrAgg: rename_func("BIT_OR"),
            exp.BitwiseXorAgg: rename_func("BIT_XOR"),
            exp.BitwiseCountAgg: rename_func("BIT_COUNT"),
            exp.ByteLength: rename_func("BYTE_LENGTH"),
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: lambda self, e: groupconcat_sql(
                self, e, func_name="STRING_AGG", within_group=False
            ),
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.Int64: rename_func("INT64"),
            exp.JSONBool: rename_func("BOOL"),
            exp.JSONExtract: _json_extract_sql,
            exp.JSONExtractArray: _json_extract_sql,
            exp.JSONExtractScalar: _json_extract_sql,
            exp.JSONFormat: lambda self, e: self.func(
                "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING",
                e.this,
                e.args.get("options"),
            ),
            exp.JSONKeysAtDepth: rename_func("JSON_KEYS"),
            exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"),
            exp.Levenshtein: _levenshtein_sql,
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.Normalize: lambda self, e: self.func(
                "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE",
                e.this,
                e.args.get("form"),
            ),
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpExtractAll: lambda self, e: self.func(
                "REGEXP_EXTRACT_ALL", e.this, e.expression
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this),
self.func("PARSE_TIME", self.format_time(e), e.this), 1354 exp.ParseDatetime: lambda self, e: self.func( 1355 "PARSE_DATETIME", self.format_time(e), e.this 1356 ), 1357 exp.Select: transforms.preprocess( 1358 [ 1359 transforms.explode_projection_to_unnest(), 1360 transforms.unqualify_unnest, 1361 transforms.eliminate_distinct_on, 1362 _alias_ordered_group, 1363 transforms.eliminate_semi_and_anti_joins, 1364 ] 1365 ), 1366 exp.SHA: rename_func("SHA1"), 1367 exp.SHA2: sha256_sql, 1368 exp.StabilityProperty: lambda self, e: ( 1369 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1370 ), 1371 exp.String: rename_func("STRING"), 1372 exp.StrPosition: lambda self, e: ( 1373 strposition_sql( 1374 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1375 ) 1376 ), 1377 exp.StrToDate: _str_to_datetime_sql, 1378 exp.StrToTime: _str_to_datetime_sql, 1379 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1380 exp.TimeFromParts: rename_func("TIME"), 1381 exp.TimestampFromParts: rename_func("DATETIME"), 1382 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1383 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1384 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1385 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1386 exp.TimeStrToTime: timestrtotime_sql, 1387 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1388 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1389 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1390 exp.TsOrDsToTime: rename_func("TIME"), 1391 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1392 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1393 exp.Unhex: rename_func("FROM_HEX"), 1394 exp.UnixDate: rename_func("UNIX_DATE"), 1395 exp.UnixToTime: _unix_to_time_sql, 1396 exp.Uuid: lambda *_: "GENERATE_UUID()", 1397 exp.Values: _derived_table_values_to_unnest, 1398 exp.VariancePop: rename_func("VAR_POP"), 1399 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1400 } 1401 1402 SUPPORTED_JSON_PATH_PARTS = { 1403 exp.JSONPathKey, 1404 exp.JSONPathRoot, 1405 exp.JSONPathSubscript, 1406 } 1407 1408 TYPE_MAPPING = { 1409 **generator.Generator.TYPE_MAPPING, 1410 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1411 exp.DataType.Type.BIGINT: "INT64", 1412 exp.DataType.Type.BINARY: "BYTES", 1413 exp.DataType.Type.BLOB: "BYTES", 1414 exp.DataType.Type.BOOLEAN: "BOOL", 1415 exp.DataType.Type.CHAR: "STRING", 1416 exp.DataType.Type.DECIMAL: "NUMERIC", 1417 exp.DataType.Type.DOUBLE: "FLOAT64", 1418 exp.DataType.Type.FLOAT: "FLOAT64", 1419 exp.DataType.Type.INT: "INT64", 1420 exp.DataType.Type.NCHAR: "STRING", 1421 exp.DataType.Type.NVARCHAR: "STRING", 1422 exp.DataType.Type.SMALLINT: "INT64", 1423 exp.DataType.Type.TEXT: "STRING", 1424 exp.DataType.Type.TIMESTAMP: "DATETIME", 1425 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1426 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1427 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1428 exp.DataType.Type.TINYINT: "INT64", 1429 exp.DataType.Type.ROWVERSION: "BYTES", 1430 exp.DataType.Type.UUID: "STRING", 1431 exp.DataType.Type.VARBINARY: "BYTES", 1432 exp.DataType.Type.VARCHAR: "STRING", 1433 exp.DataType.Type.VARIANT: "ANY TYPE", 1434 } 1435 1436 PROPERTIES_LOCATION = { 1437 **generator.Generator.PROPERTIES_LOCATION, 1438 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1439 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1440 } 1441 1442 # WINDOW comes after QUALIFY 1443 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1444 AFTER_HAVING_MODIFIER_TRANSFORMS = { 
1445 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1446 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1447 } 1448 1449 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1450 RESERVED_KEYWORDS = { 1451 "all", 1452 "and", 1453 "any", 1454 "array", 1455 "as", 1456 "asc", 1457 "assert_rows_modified", 1458 "at", 1459 "between", 1460 "by", 1461 "case", 1462 "cast", 1463 "collate", 1464 "contains", 1465 "create", 1466 "cross", 1467 "cube", 1468 "current", 1469 "default", 1470 "define", 1471 "desc", 1472 "distinct", 1473 "else", 1474 "end", 1475 "enum", 1476 "escape", 1477 "except", 1478 "exclude", 1479 "exists", 1480 "extract", 1481 "false", 1482 "fetch", 1483 "following", 1484 "for", 1485 "from", 1486 "full", 1487 "group", 1488 "grouping", 1489 "groups", 1490 "hash", 1491 "having", 1492 "if", 1493 "ignore", 1494 "in", 1495 "inner", 1496 "intersect", 1497 "interval", 1498 "into", 1499 "is", 1500 "join", 1501 "lateral", 1502 "left", 1503 "like", 1504 "limit", 1505 "lookup", 1506 "merge", 1507 "natural", 1508 "new", 1509 "no", 1510 "not", 1511 "null", 1512 "nulls", 1513 "of", 1514 "on", 1515 "or", 1516 "order", 1517 "outer", 1518 "over", 1519 "partition", 1520 "preceding", 1521 "proto", 1522 "qualify", 1523 "range", 1524 "recursive", 1525 "respect", 1526 "right", 1527 "rollup", 1528 "rows", 1529 "select", 1530 "set", 1531 "some", 1532 "struct", 1533 "tablesample", 1534 "then", 1535 "to", 1536 "treat", 1537 "true", 1538 "unbounded", 1539 "union", 1540 "unnest", 1541 "using", 1542 "when", 1543 "where", 1544 "window", 1545 "with", 1546 "within", 1547 } 1548 1549 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1550 unit = expression.unit 1551 unit_sql = unit.name if unit.is_string else self.sql(unit) 1552 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1553 1554 def mod_sql(self, expression: exp.Mod) -> str: 1555 this = expression.this 1556 expr = expression.expression 1557 return self.func( 1558 "MOD", 1559 this.unnest() if isinstance(this, exp.Paren) else this, 1560 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1561 ) 1562 1563 def column_parts(self, expression: exp.Column) -> str: 1564 if expression.meta.get("quoted_column"): 1565 # If a column reference is of the form `dataset.table`.name, we need 1566 # to preserve the quoted table path, otherwise the reference breaks 1567 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1568 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1569 return f"{table_path}.{self.sql(expression, 'this')}" 1570 1571 return super().column_parts(expression) 1572 1573 def table_parts(self, expression: exp.Table) -> str: 1574 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1575 # we need to make sure the correct quoting is used in each case. 
1576 # 1577 # For example, if there is a CTE x that clashes with a schema name, then the former will 1578 # return the table y in that schema, whereas the latter will return the CTE's y column: 1579 # 1580 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1581 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1582 if expression.meta.get("quoted_table"): 1583 table_parts = ".".join(p.name for p in expression.parts) 1584 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1585 1586 return super().table_parts(expression) 1587 1588 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1589 this = expression.this 1590 if isinstance(this, exp.TsOrDsToDatetime): 1591 func_name = "FORMAT_DATETIME" 1592 elif isinstance(this, exp.TsOrDsToTimestamp): 1593 func_name = "FORMAT_TIMESTAMP" 1594 elif isinstance(this, exp.TsOrDsToTime): 1595 func_name = "FORMAT_TIME" 1596 else: 1597 func_name = "FORMAT_DATE" 1598 1599 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1600 return self.func( 1601 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1602 ) 1603 1604 def eq_sql(self, expression: exp.EQ) -> str: 1605 # Operands of = cannot be NULL in BigQuery 1606 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1607 if not isinstance(expression.parent, exp.Update): 1608 return "NULL" 1609 1610 return self.binary(expression, "=") 1611 1612 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1613 parent = expression.parent 1614 1615 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1616 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
1617 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1618 return self.func( 1619 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1620 ) 1621 1622 return super().attimezone_sql(expression) 1623 1624 def trycast_sql(self, expression: exp.TryCast) -> str: 1625 return self.cast_sql(expression, safe_prefix="SAFE_") 1626 1627 def bracket_sql(self, expression: exp.Bracket) -> str: 1628 this = expression.this 1629 expressions = expression.expressions 1630 1631 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1632 arg = expressions[0] 1633 if arg.type is None: 1634 from sqlglot.optimizer.annotate_types import annotate_types 1635 1636 arg = annotate_types(arg, dialect=self.dialect) 1637 1638 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1639 # BQ doesn't support bracket syntax with string values for structs 1640 return f"{self.sql(this)}.{arg.name}" 1641 1642 expressions_sql = self.expressions(expression, flat=True) 1643 offset = expression.args.get("offset") 1644 1645 if offset == 0: 1646 expressions_sql = f"OFFSET({expressions_sql})" 1647 elif offset == 1: 1648 expressions_sql = f"ORDINAL({expressions_sql})" 1649 elif offset is not None: 1650 self.unsupported(f"Unsupported array offset: {offset}") 1651 1652 if expression.args.get("safe"): 1653 expressions_sql = f"SAFE_{expressions_sql}" 1654 1655 return f"{self.sql(this)}[{expressions_sql}]" 1656 1657 def in_unnest_op(self, expression: exp.Unnest) -> str: 1658 return self.sql(expression) 1659 1660 def version_sql(self, expression: exp.Version) -> str: 1661 if expression.name == "TIMESTAMP": 1662 expression.set("this", "SYSTEM_TIME") 1663 return super().version_sql(expression) 1664 1665 def contains_sql(self, expression: exp.Contains) -> str: 1666 this = expression.this 1667 expr = expression.expression 1668 1669 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1670 this = this.this 1671 expr = expr.this 1672 1673 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1674 1675 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1676 this = expression.this 1677 1678 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1679 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1680 # because they aren't literals and so the above syntax is invalid BigQuery. 1681 if isinstance(this, exp.Array): 1682 elem = seq_get(this.expressions, 0) 1683 if not (elem and elem.find(exp.Query)): 1684 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1685 1686 return super().cast_sql(expression, safe_prefix=safe_prefix) 1687 1688 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1689 variables = self.expressions(expression, "this") 1690 default = self.sql(expression, "default") 1691 default = f" DEFAULT {default}" if default else "" 1692 kind = self.sql(expression, "kind") 1693 kind = f" {kind}" if kind else "" 1694 1695 return f"{variables}{kind}{default}"
440class BigQuery(Dialect): 441 WEEK_OFFSET = -1 442 UNNEST_COLUMN_ONLY = True 443 SUPPORTS_USER_DEFINED_TYPES = False 444 SUPPORTS_SEMI_ANTI_JOIN = False 445 LOG_BASE_FIRST = False 446 HEX_LOWERCASE = True 447 FORCE_EARLY_ALIAS_REF_EXPANSION = True 448 PRESERVE_ORIGINAL_NAMES = True 449 HEX_STRING_IS_INTEGER_TYPE = True 450 451 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 452 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 453 454 # bigquery udfs are case sensitive 455 NORMALIZE_FUNCTIONS = False 456 457 # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time 458 TIME_MAPPING = { 459 "%D": "%m/%d/%y", 460 "%E6S": "%S.%f", 461 "%e": "%-d", 462 } 463 464 FORMAT_MAPPING = { 465 "DD": "%d", 466 "MM": "%m", 467 "MON": "%b", 468 "MONTH": "%B", 469 "YYYY": "%Y", 470 "YY": "%y", 471 "HH": "%I", 472 "HH12": "%I", 473 "HH24": "%H", 474 "MI": "%M", 475 "SS": "%S", 476 "SSSSS": "%f", 477 "TZH": "%z", 478 } 479 480 # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement 481 # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table 482 # https://cloud.google.com/bigquery/docs/querying-wildcard-tables#scanning_a_range_of_tables_using_table_suffix 483 # https://cloud.google.com/bigquery/docs/query-cloud-storage-data#query_the_file_name_pseudo-column 484 PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE", "_TABLE_SUFFIX", "_FILE_NAME"} 485 486 # All set operations require either a DISTINCT or ALL specifier 487 SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None) 488 489 # https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#percentile_cont 490 COERCES_TO = { 491 **TypeAnnotator.COERCES_TO, 492 exp.DataType.Type.BIGDECIMAL: {exp.DataType.Type.DOUBLE}, 493 } 494 COERCES_TO[exp.DataType.Type.DECIMAL] |= {exp.DataType.Type.BIGDECIMAL} 495 COERCES_TO[exp.DataType.Type.BIGINT] |= {exp.DataType.Type.BIGDECIMAL} 496 497 # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types 498 TYPE_TO_EXPRESSIONS = { 499 **Dialect.TYPE_TO_EXPRESSIONS, 500 exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP], 501 } 502 TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP) 503 504 ANNOTATORS = { 505 **Dialect.ANNOTATORS, 506 **{ 507 expr_type: annotate_with_type_lambda(data_type) 508 for data_type, expressions in TYPE_TO_EXPRESSIONS.items() 509 for expr_type in expressions 510 }, 511 **{ 512 expr_type: lambda self, e: _annotate_math_functions(self, e) 513 for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round) 514 }, 515 **{ 516 expr_type: lambda self, e: self._annotate_by_args(e, "this") 517 for expr_type in ( 518 exp.Abs, 519 exp.ArgMax, 520 exp.ArgMin, 521 exp.DateTrunc, 522 exp.DatetimeTrunc, 523 exp.FirstValue, 524 exp.GroupConcat, 525 exp.IgnoreNulls, 526 exp.JSONExtract, 527 exp.Lead, 528 exp.Left, 529 exp.Lower, 530 exp.NthValue, 531 exp.Pad, 532 exp.PercentileDisc, 533 exp.RegexpExtract, 534 exp.RegexpReplace, 535 exp.Repeat, 536 exp.Replace, 537 exp.RespectNulls, 538 exp.Reverse, 539 exp.Right, 540 exp.SafeNegate, 541 exp.Sign, 542 exp.Substring, 543 exp.TimestampTrunc, 544 exp.Translate, 545 exp.Trim, 546 exp.Upper, 547 ) 548 }, 549 exp.Acos: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 550 exp.Acosh: lambda self, e: 
self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 551 exp.Asin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 552 exp.Asinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 553 exp.Atan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 554 exp.Atanh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 555 exp.Atan2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 556 exp.ApproxTopSum: lambda self, e: _annotate_by_args_approx_top(self, e), 557 exp.ApproxTopK: lambda self, e: _annotate_by_args_approx_top(self, e), 558 exp.ApproxQuantiles: lambda self, e: self._annotate_by_args(e, "this", array=True), 559 exp.Array: _annotate_array, 560 exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"), 561 exp.Ascii: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 562 exp.BitwiseAndAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 563 exp.BitwiseOrAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 564 exp.BitwiseXorAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 565 exp.BitwiseCountAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 566 exp.ByteLength: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 567 exp.ByteString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 568 exp.Cbrt: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 569 exp.CodePointsToBytes: lambda self, e: self._annotate_with_type( 570 e, exp.DataType.Type.BINARY 571 ), 572 exp.CodePointsToString: lambda self, e: self._annotate_with_type( 573 e, exp.DataType.Type.VARCHAR 574 ), 575 exp.Concat: _annotate_concat, 576 exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 577 exp.Cot: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 578 exp.CosineDistance: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 579 exp.Coth: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 580 exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 581 exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 582 exp.Csc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 583 exp.Csch: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 584 exp.CumeDist: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 585 exp.DateFromUnixDate: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATE), 586 exp.DenseRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 587 exp.EuclideanDistance: lambda self, e: self._annotate_with_type( 588 e, exp.DataType.Type.DOUBLE 589 ), 590 exp.FarmFingerprint: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 591 exp.Unhex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 592 exp.Float64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 593 exp.Format: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 594 exp.GenerateTimestampArray: lambda self, e: self._annotate_with_type( 595 e, exp.DataType.build("ARRAY<TIMESTAMP>", dialect="bigquery") 596 ), 597 exp.Grouping: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 598 exp.IsInf: lambda self, e: 
self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 599 exp.IsNan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 600 exp.JSONArray: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 601 exp.JSONArrayAppend: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 602 exp.JSONArrayInsert: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 603 exp.JSONBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 604 exp.JSONExtractScalar: lambda self, e: self._annotate_with_type( 605 e, exp.DataType.Type.VARCHAR 606 ), 607 exp.JSONExtractArray: lambda self, e: self._annotate_by_args(e, "this", array=True), 608 exp.JSONFormat: lambda self, e: self._annotate_with_type( 609 e, exp.DataType.Type.JSON if e.args.get("to_json") else exp.DataType.Type.VARCHAR 610 ), 611 exp.JSONKeysAtDepth: lambda self, e: self._annotate_with_type( 612 e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery") 613 ), 614 exp.JSONObject: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 615 exp.JSONRemove: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 616 exp.JSONSet: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 617 exp.JSONStripNulls: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 618 exp.JSONType: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 619 exp.JSONValueArray: lambda self, e: self._annotate_with_type( 620 e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery") 621 ), 622 exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"), 623 exp.LowerHex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 624 exp.LaxBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 625 exp.LaxFloat64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 626 exp.LaxInt64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 627 exp.LaxString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 628 exp.MD5Digest: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 629 exp.Normalize: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 630 exp.Ntile: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 631 exp.ParseTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 632 exp.ParseDatetime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATETIME), 633 exp.ParseBignumeric: lambda self, e: self._annotate_with_type( 634 e, exp.DataType.Type.BIGDECIMAL 635 ), 636 exp.ParseNumeric: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DECIMAL), 637 exp.PercentileCont: lambda self, e: _annotate_by_args_with_coerce(self, e), 638 exp.PercentRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 639 exp.Rank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 640 exp.RangeBucket: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 641 exp.RegexpExtractAll: lambda self, e: self._annotate_by_args(e, "this", array=True), 642 exp.RegexpInstr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 643 exp.RowNumber: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 644 exp.Rand: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 645 exp.SafeConvertBytesToString: lambda self, e: self._annotate_with_type( 646 e, 
exp.DataType.Type.VARCHAR 647 ), 648 exp.SafeAdd: lambda self, e: _annotate_by_args_with_coerce(self, e), 649 exp.SafeMultiply: lambda self, e: _annotate_by_args_with_coerce(self, e), 650 exp.SafeSubtract: lambda self, e: _annotate_by_args_with_coerce(self, e), 651 exp.Sec: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 652 exp.Sech: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 653 exp.Soundex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 654 exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 655 exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 656 exp.Sin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 657 exp.Sinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 658 exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True), 659 exp.TimestampFromParts: lambda self, e: self._annotate_with_type( 660 e, exp.DataType.Type.DATETIME 661 ), 662 exp.TimeFromParts: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 663 exp.TimeTrunc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 664 exp.ToCodePoints: lambda self, e: self._annotate_with_type( 665 e, exp.DataType.build("ARRAY<BIGINT>", dialect="bigquery") 666 ), 667 exp.TsOrDsToTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 668 exp.Unicode: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 669 exp.Uuid: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 670 } 671 672 def normalize_identifier(self, expression: E) -> E: 673 if ( 674 isinstance(expression, exp.Identifier) 675 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 676 ): 677 parent = expression.parent 678 while isinstance(parent, exp.Dot): 679 parent = parent.parent 680 681 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 682 # by default. The following check uses a heuristic to detect tables based on whether 683 # they are qualified. This should generally be correct, because tables in BigQuery 684 # must be qualified with at least a dataset, unless @@dataset_id is set. 
685 case_sensitive = ( 686 isinstance(parent, exp.UserDefinedFunction) 687 or ( 688 isinstance(parent, exp.Table) 689 and parent.db 690 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 691 ) 692 or expression.meta.get("is_table") 693 ) 694 if not case_sensitive: 695 expression.set("this", expression.this.lower()) 696 697 return t.cast(E, expression) 698 699 return super().normalize_identifier(expression) 700 701 class JSONPathTokenizer(jsonpath.JSONPathTokenizer): 702 VAR_TOKENS = { 703 TokenType.DASH, 704 TokenType.VAR, 705 } 706 707 class Tokenizer(tokens.Tokenizer): 708 QUOTES = ["'", '"', '"""', "'''"] 709 COMMENTS = ["--", "#", ("/*", "*/")] 710 IDENTIFIERS = ["`"] 711 STRING_ESCAPES = ["\\"] 712 713 HEX_STRINGS = [("0x", ""), ("0X", "")] 714 715 BYTE_STRINGS = [ 716 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 717 ] 718 719 RAW_STRINGS = [ 720 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 721 ] 722 723 NESTED_COMMENTS = False 724 725 KEYWORDS = { 726 **tokens.Tokenizer.KEYWORDS, 727 "ANY TYPE": TokenType.VARIANT, 728 "BEGIN": TokenType.COMMAND, 729 "BEGIN TRANSACTION": TokenType.BEGIN, 730 "BYTEINT": TokenType.INT, 731 "BYTES": TokenType.BINARY, 732 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 733 "DATETIME": TokenType.TIMESTAMP, 734 "DECLARE": TokenType.DECLARE, 735 "ELSEIF": TokenType.COMMAND, 736 "EXCEPTION": TokenType.COMMAND, 737 "EXPORT": TokenType.EXPORT, 738 "FLOAT64": TokenType.DOUBLE, 739 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 740 "LOOP": TokenType.COMMAND, 741 "MODEL": TokenType.MODEL, 742 "NOT DETERMINISTIC": TokenType.VOLATILE, 743 "RECORD": TokenType.STRUCT, 744 "REPEAT": TokenType.COMMAND, 745 "TIMESTAMP": TokenType.TIMESTAMPTZ, 746 "WHILE": TokenType.COMMAND, 747 } 748 KEYWORDS.pop("DIV") 749 KEYWORDS.pop("VALUES") 750 KEYWORDS.pop("/*+") 751 752 class Parser(parser.Parser): 753 PREFIXED_PIVOT_COLUMNS = True 754 LOG_DEFAULTS_TO_LN = True 755 SUPPORTS_IMPLICIT_UNNEST = True 756 JOINS_HAVE_EQUAL_PRECEDENCE = True 757 758 # BigQuery does not allow ASC/DESC to be used as an identifier 759 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 760 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 761 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 762 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 763 TokenType.ASC, 764 TokenType.DESC, 765 } 766 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 767 768 FUNCTIONS = { 769 **parser.Parser.FUNCTIONS, 770 "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list, 771 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 772 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 773 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 774 "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list, 775 "BOOL": exp.JSONBool.from_arg_list, 776 "CONTAINS_SUBSTR": _build_contains_substring, 777 "DATE": _build_date, 778 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 779 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 780 "DATE_TRUNC": lambda args: exp.DateTrunc( 781 unit=seq_get(args, 1), 782 this=seq_get(args, 0), 783 zone=seq_get(args, 2), 784 ), 785 "DATETIME": _build_datetime, 786 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 787 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 788 "DIV": binary_from_function(exp.IntDiv), 789 "EDIT_DISTANCE": _build_levenshtein, 790 
"FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 791 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 792 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 793 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 794 "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 795 "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list, 796 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 797 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 798 "JSON_STRIP_NULLS": _build_json_strip_nulls, 799 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 800 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 801 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 802 "MD5": exp.MD5Digest.from_arg_list, 803 "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize( 804 this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True 805 ), 806 "OCTET_LENGTH": exp.ByteLength.from_arg_list, 807 "TO_HEX": _build_to_hex, 808 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 809 [seq_get(args, 1), seq_get(args, 0)] 810 ), 811 "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")( 812 [seq_get(args, 1), seq_get(args, 0)] 813 ), 814 "PARSE_TIMESTAMP": _build_parse_timestamp, 815 "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")( 816 [seq_get(args, 1), seq_get(args, 0)] 817 ), 818 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 819 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 820 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 821 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 822 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 823 ), 824 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 825 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 826 "SPLIT": lambda args: exp.Split( 827 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 828 this=seq_get(args, 0), 829 expression=seq_get(args, 1) or exp.Literal.string(","), 830 ), 831 "STRPOS": exp.StrPosition.from_arg_list, 832 "TIME": _build_time, 833 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 834 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 835 "TIMESTAMP": _build_timestamp, 836 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 837 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 838 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 839 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 840 ), 841 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 842 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 843 ), 844 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 845 "TO_JSON": lambda args: exp.JSONFormat( 846 this=seq_get(args, 0), options=seq_get(args, 1), to_json=True 847 ), 848 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 849 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 850 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 851 "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime), 852 "FROM_HEX": exp.Unhex.from_arg_list, 853 "WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 854 } 855 856 FUNCTION_PARSERS = { 857 **parser.Parser.FUNCTION_PARSERS, 858 "ARRAY": lambda self: self.expression(exp.Array, 
expressions=[self._parse_statement()]), 859 "JSON_ARRAY": lambda self: self.expression( 860 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 861 ), 862 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 863 "PREDICT": lambda self: self._parse_predict(), 864 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 865 "GENERATE_EMBEDDING": lambda self: self._parse_generate_embedding(), 866 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 867 } 868 FUNCTION_PARSERS.pop("TRIM") 869 870 NO_PAREN_FUNCTIONS = { 871 **parser.Parser.NO_PAREN_FUNCTIONS, 872 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 873 } 874 875 NESTED_TYPE_TOKENS = { 876 *parser.Parser.NESTED_TYPE_TOKENS, 877 TokenType.TABLE, 878 } 879 880 PROPERTY_PARSERS = { 881 **parser.Parser.PROPERTY_PARSERS, 882 "NOT DETERMINISTIC": lambda self: self.expression( 883 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 884 ), 885 "OPTIONS": lambda self: self._parse_with_property(), 886 } 887 888 CONSTRAINT_PARSERS = { 889 **parser.Parser.CONSTRAINT_PARSERS, 890 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 891 } 892 893 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 894 RANGE_PARSERS.pop(TokenType.OVERLAPS) 895 896 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 897 898 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 899 900 STATEMENT_PARSERS = { 901 **parser.Parser.STATEMENT_PARSERS, 902 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 903 TokenType.END: lambda self: self._parse_as_command(self._prev), 904 TokenType.FOR: lambda self: self._parse_for_in(), 905 TokenType.EXPORT: lambda self: self._parse_export_data(), 906 TokenType.DECLARE: lambda self: self._parse_declare(), 907 } 908 909 BRACKET_OFFSETS = { 910 "OFFSET": (0, False), 911 "ORDINAL": (1, False), 912 "SAFE_OFFSET": (0, True), 913 "SAFE_ORDINAL": (1, True), 914 } 915 916 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 917 index = self._index 918 this = self._parse_range() 919 self._match_text_seq("DO") 920 if self._match(TokenType.COMMAND): 921 self._retreat(index) 922 return self._parse_as_command(self._prev) 923 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 924 925 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 926 this = super()._parse_table_part(schema=schema) or self._parse_number() 927 928 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 929 if isinstance(this, exp.Identifier): 930 table_name = this.name 931 while self._match(TokenType.DASH, advance=False) and self._next: 932 start = self._curr 933 while self._is_connected() and not self._match_set( 934 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 935 ): 936 self._advance() 937 938 if start == self._curr: 939 break 940 941 table_name += self._find_sql(start, self._prev) 942 943 this = exp.Identifier( 944 this=table_name, quoted=this.args.get("quoted") 945 ).update_positions(this) 946 elif isinstance(this, exp.Literal): 947 table_name = this.name 948 949 if self._is_connected() and self._parse_var(any_token=True): 950 table_name += self._prev.text 951 952 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 953 954 return this 955 956 def _parse_table_parts( 957 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 958 ) -> exp.Table: 959 table = super()._parse_table_parts( 960 schema=schema, 
is_db_reference=is_db_reference, wildcard=True 961 ) 962 963 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 964 if not table.catalog: 965 if table.db: 966 previous_db = table.args["db"] 967 parts = table.db.split(".") 968 if len(parts) == 2 and not table.args["db"].quoted: 969 table.set( 970 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 971 ) 972 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 973 else: 974 previous_this = table.this 975 parts = table.name.split(".") 976 if len(parts) == 2 and not table.this.quoted: 977 table.set( 978 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 979 ) 980 table.set( 981 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 982 ) 983 984 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 985 alias = table.this 986 catalog, db, this, *rest = ( 987 exp.to_identifier(p, quoted=True) 988 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 989 ) 990 991 for part in (catalog, db, this): 992 if part: 993 part.update_positions(table.this) 994 995 if rest and this: 996 this = exp.Dot.build([this, *rest]) # type: ignore 997 998 table = exp.Table( 999 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 1000 ) 1001 table.meta["quoted_table"] = True 1002 else: 1003 alias = None 1004 1005 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 1006 # dataset, so if the project identifier is omitted we need to fix the ast so that 1007 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 1008 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 1009 # views, because it would seem like the "catalog" part is set, when it'd actually 1010 # be the region/dataset. Merging the two identifiers into a single one is done to 1011 # avoid producing a 4-part Table reference, which would cause issues in the schema 1012 # module, when there are 3-part table names mixed with information schema views. 1013 # 1014 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 1015 table_parts = table.parts 1016 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 1017 # We need to alias the table here to avoid breaking existing qualified columns. 1018 # This is expected to be safe, because if there's an actual alias coming up in 1019 # the token stream, it will overwrite this one. If there isn't one, we are only 1020 # exposing the name that can be used to reference the view explicitly (a no-op). 1021 exp.alias_( 1022 table, 1023 t.cast(exp.Identifier, alias or table_parts[-1]), 1024 table=True, 1025 copy=False, 1026 ) 1027 1028 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 1029 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 1030 line=table_parts[-2].meta.get("line"), 1031 col=table_parts[-1].meta.get("col"), 1032 start=table_parts[-2].meta.get("start"), 1033 end=table_parts[-1].meta.get("end"), 1034 ) 1035 table.set("this", new_this) 1036 table.set("db", seq_get(table_parts, -3)) 1037 table.set("catalog", seq_get(table_parts, -4)) 1038 1039 return table 1040 1041 def _parse_column(self) -> t.Optional[exp.Expression]: 1042 column = super()._parse_column() 1043 if isinstance(column, exp.Column): 1044 parts = column.parts 1045 if any("." 
in p.name for p in parts): 1046 catalog, db, table, this, *rest = ( 1047 exp.to_identifier(p, quoted=True) 1048 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 1049 ) 1050 1051 if rest and this: 1052 this = exp.Dot.build([this, *rest]) # type: ignore 1053 1054 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 1055 column.meta["quoted_column"] = True 1056 1057 return column 1058 1059 @t.overload 1060 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 1061 1062 @t.overload 1063 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 1064 1065 def _parse_json_object(self, agg=False): 1066 json_object = super()._parse_json_object() 1067 array_kv_pair = seq_get(json_object.expressions, 0) 1068 1069 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 1070 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 1071 if ( 1072 array_kv_pair 1073 and isinstance(array_kv_pair.this, exp.Array) 1074 and isinstance(array_kv_pair.expression, exp.Array) 1075 ): 1076 keys = array_kv_pair.this.expressions 1077 values = array_kv_pair.expression.expressions 1078 1079 json_object.set( 1080 "expressions", 1081 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 1082 ) 1083 1084 return json_object 1085 1086 def _parse_bracket( 1087 self, this: t.Optional[exp.Expression] = None 1088 ) -> t.Optional[exp.Expression]: 1089 bracket = super()._parse_bracket(this) 1090 1091 if this is bracket: 1092 return bracket 1093 1094 if isinstance(bracket, exp.Bracket): 1095 for expression in bracket.expressions: 1096 name = expression.name.upper() 1097 1098 if name not in self.BRACKET_OFFSETS: 1099 break 1100 1101 offset, safe = self.BRACKET_OFFSETS[name] 1102 bracket.set("offset", offset) 1103 bracket.set("safe", safe) 1104 expression.replace(expression.expressions[0]) 1105 1106 return bracket 1107 1108 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 1109 unnest = super()._parse_unnest(with_alias=with_alias) 1110 1111 if not unnest: 1112 return None 1113 1114 unnest_expr = seq_get(unnest.expressions, 0) 1115 if unnest_expr: 1116 from sqlglot.optimizer.annotate_types import annotate_types 1117 1118 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 1119 1120 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 1121 # in contrast to other dialects such as DuckDB which flattens only the array by default 1122 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 1123 array_elem.is_type(exp.DataType.Type.STRUCT) 1124 for array_elem in unnest_expr._type.expressions 1125 ): 1126 unnest.set("explode_array", True) 1127 1128 return unnest 1129 1130 def _parse_make_interval(self) -> exp.MakeInterval: 1131 expr = exp.MakeInterval() 1132 1133 for arg_key in expr.arg_types: 1134 value = self._parse_lambda() 1135 1136 if not value: 1137 break 1138 1139 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 1140 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 1141 if isinstance(value, exp.Kwarg): 1142 arg_key = value.this.name 1143 1144 expr.set(arg_key, value) 1145 1146 self._match(TokenType.COMMA) 1147 1148 return expr 1149 1150 def _parse_predict(self) -> exp.Predict: 1151 self._match_text_seq("MODEL") 1152 this = self._parse_table() 1153 1154 self._match(TokenType.COMMA) 1155 self._match_text_seq("TABLE") 1156 1157 return 
self.expression( 1158 exp.Predict, 1159 this=this, 1160 expression=self._parse_table(), 1161 params_struct=self._match(TokenType.COMMA) and self._parse_bitwise(), 1162 ) 1163 1164 def _parse_generate_embedding(self) -> exp.GenerateEmbedding: 1165 self._match_text_seq("MODEL") 1166 this = self._parse_table() 1167 1168 self._match(TokenType.COMMA) 1169 self._match_text_seq("TABLE") 1170 1171 return self.expression( 1172 exp.GenerateEmbedding, 1173 this=this, 1174 expression=self._parse_table(), 1175 params_struct=self._match(TokenType.COMMA) and self._parse_bitwise(), 1176 ) 1177 1178 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 1179 self._match(TokenType.TABLE) 1180 this = self._parse_table() 1181 1182 expr = self.expression(exp.FeaturesAtTime, this=this) 1183 1184 while self._match(TokenType.COMMA): 1185 arg = self._parse_lambda() 1186 1187 # Get the LHS of the Kwarg and set the arg to that value, e.g 1188 # "num_rows => 1" sets the expr's `num_rows` arg 1189 if arg: 1190 expr.set(arg.this.name, arg) 1191 1192 return expr 1193 1194 def _parse_vector_search(self) -> exp.VectorSearch: 1195 self._match(TokenType.TABLE) 1196 base_table = self._parse_table() 1197 1198 self._match(TokenType.COMMA) 1199 1200 column_to_search = self._parse_bitwise() 1201 self._match(TokenType.COMMA) 1202 1203 self._match(TokenType.TABLE) 1204 query_table = self._parse_table() 1205 1206 expr = self.expression( 1207 exp.VectorSearch, 1208 this=base_table, 1209 column_to_search=column_to_search, 1210 query_table=query_table, 1211 ) 1212 1213 while self._match(TokenType.COMMA): 1214 # query_column_to_search can be named argument or positional 1215 if self._match(TokenType.STRING, advance=False): 1216 query_column = self._parse_string() 1217 expr.set("query_column_to_search", query_column) 1218 else: 1219 arg = self._parse_lambda() 1220 if arg: 1221 expr.set(arg.this.name, arg) 1222 1223 return expr 1224 1225 def _parse_export_data(self) -> exp.Export: 1226 self._match_text_seq("DATA") 1227 1228 return self.expression( 1229 exp.Export, 1230 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 1231 options=self._parse_properties(), 1232 this=self._match_text_seq("AS") and self._parse_select(), 1233 ) 1234 1235 class Generator(generator.Generator): 1236 INTERVAL_ALLOWS_PLURAL_FORM = False 1237 JOIN_HINTS = False 1238 QUERY_HINTS = False 1239 TABLE_HINTS = False 1240 LIMIT_FETCH = "LIMIT" 1241 RENAME_TABLE_WITH_DB = False 1242 NVL2_SUPPORTED = False 1243 UNNEST_WITH_ORDINALITY = False 1244 COLLATE_IS_FUNC = True 1245 LIMIT_ONLY_LITERALS = True 1246 SUPPORTS_TABLE_ALIAS_COLUMNS = False 1247 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 1248 JSON_KEY_VALUE_PAIR_SEP = "," 1249 NULL_ORDERING_SUPPORTED = False 1250 IGNORE_NULLS_IN_FUNC = True 1251 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 1252 CAN_IMPLEMENT_ARRAY_ANY = True 1253 SUPPORTS_TO_NUMBER = False 1254 NAMED_PLACEHOLDER_TOKEN = "@" 1255 HEX_FUNC = "TO_HEX" 1256 WITH_PROPERTIES_PREFIX = "OPTIONS" 1257 SUPPORTS_EXPLODING_PROJECTIONS = False 1258 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 1259 SUPPORTS_UNIX_SECONDS = True 1260 1261 SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$") 1262 1263 TS_OR_DS_TYPES = ( 1264 exp.TsOrDsToDatetime, 1265 exp.TsOrDsToTimestamp, 1266 exp.TsOrDsToTime, 1267 exp.TsOrDsToDate, 1268 ) 1269 1270 TRANSFORMS = { 1271 **generator.Generator.TRANSFORMS, 1272 exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"), 1273 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 1274 exp.ArgMax: 
arg_max_or_min_no_count("MAX_BY"), 1275 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 1276 exp.Array: inline_array_unless_query, 1277 exp.ArrayContains: _array_contains_sql, 1278 exp.ArrayFilter: filter_array_using_unnest, 1279 exp.ArrayRemove: filter_array_using_unnest, 1280 exp.BitwiseAndAgg: rename_func("BIT_AND"), 1281 exp.BitwiseOrAgg: rename_func("BIT_OR"), 1282 exp.BitwiseXorAgg: rename_func("BIT_XOR"), 1283 exp.BitwiseCountAgg: rename_func("BIT_COUNT"), 1284 exp.ByteLength: rename_func("BYTE_LENGTH"), 1285 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 1286 exp.CollateProperty: lambda self, e: ( 1287 f"DEFAULT COLLATE {self.sql(e, 'this')}" 1288 if e.args.get("default") 1289 else f"COLLATE {self.sql(e, 'this')}" 1290 ), 1291 exp.Commit: lambda *_: "COMMIT TRANSACTION", 1292 exp.CountIf: rename_func("COUNTIF"), 1293 exp.Create: _create_sql, 1294 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 1295 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 1296 exp.DateDiff: lambda self, e: self.func( 1297 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 1298 ), 1299 exp.DateFromParts: rename_func("DATE"), 1300 exp.DateStrToDate: datestrtodate_sql, 1301 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 1302 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 1303 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 1304 exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"), 1305 exp.FromTimeZone: lambda self, e: self.func( 1306 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 1307 ), 1308 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 1309 exp.GroupConcat: lambda self, e: groupconcat_sql( 1310 self, e, func_name="STRING_AGG", within_group=False 1311 ), 1312 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1313 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1314 exp.If: if_sql(false_value="NULL"), 1315 exp.ILike: no_ilike_sql, 1316 exp.IntDiv: rename_func("DIV"), 1317 exp.Int64: rename_func("INT64"), 1318 exp.JSONBool: rename_func("BOOL"), 1319 exp.JSONExtract: _json_extract_sql, 1320 exp.JSONExtractArray: _json_extract_sql, 1321 exp.JSONExtractScalar: _json_extract_sql, 1322 exp.JSONFormat: lambda self, e: self.func( 1323 "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING", 1324 e.this, 1325 e.args.get("options"), 1326 ), 1327 exp.JSONKeysAtDepth: rename_func("JSON_KEYS"), 1328 exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"), 1329 exp.Levenshtein: _levenshtein_sql, 1330 exp.Max: max_or_greatest, 1331 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1332 exp.MD5Digest: rename_func("MD5"), 1333 exp.Min: min_or_least, 1334 exp.Normalize: lambda self, e: self.func( 1335 "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE", 1336 e.this, 1337 e.args.get("form"), 1338 ), 1339 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1340 exp.RegexpExtract: lambda self, e: self.func( 1341 "REGEXP_EXTRACT", 1342 e.this, 1343 e.expression, 1344 e.args.get("position"), 1345 e.args.get("occurrence"), 1346 ), 1347 exp.RegexpExtractAll: lambda self, e: self.func( 1348 "REGEXP_EXTRACT_ALL", e.this, e.expression 1349 ), 1350 exp.RegexpReplace: regexp_replace_sql, 1351 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1352 exp.ReturnsProperty: _returnsproperty_sql, 1353 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1354 exp.ParseTime: lambda self, e: 
self.func("PARSE_TIME", self.format_time(e), e.this), 1355 exp.ParseDatetime: lambda self, e: self.func( 1356 "PARSE_DATETIME", self.format_time(e), e.this 1357 ), 1358 exp.Select: transforms.preprocess( 1359 [ 1360 transforms.explode_projection_to_unnest(), 1361 transforms.unqualify_unnest, 1362 transforms.eliminate_distinct_on, 1363 _alias_ordered_group, 1364 transforms.eliminate_semi_and_anti_joins, 1365 ] 1366 ), 1367 exp.SHA: rename_func("SHA1"), 1368 exp.SHA2: sha256_sql, 1369 exp.StabilityProperty: lambda self, e: ( 1370 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1371 ), 1372 exp.String: rename_func("STRING"), 1373 exp.StrPosition: lambda self, e: ( 1374 strposition_sql( 1375 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1376 ) 1377 ), 1378 exp.StrToDate: _str_to_datetime_sql, 1379 exp.StrToTime: _str_to_datetime_sql, 1380 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1381 exp.TimeFromParts: rename_func("TIME"), 1382 exp.TimestampFromParts: rename_func("DATETIME"), 1383 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1384 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1385 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1386 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1387 exp.TimeStrToTime: timestrtotime_sql, 1388 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1389 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1390 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1391 exp.TsOrDsToTime: rename_func("TIME"), 1392 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1393 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1394 exp.Unhex: rename_func("FROM_HEX"), 1395 exp.UnixDate: rename_func("UNIX_DATE"), 1396 exp.UnixToTime: _unix_to_time_sql, 1397 exp.Uuid: lambda *_: "GENERATE_UUID()", 1398 exp.Values: _derived_table_values_to_unnest, 1399 exp.VariancePop: rename_func("VAR_POP"), 1400 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1401 } 1402 1403 SUPPORTED_JSON_PATH_PARTS = { 1404 exp.JSONPathKey, 1405 exp.JSONPathRoot, 1406 exp.JSONPathSubscript, 1407 } 1408 1409 TYPE_MAPPING = { 1410 **generator.Generator.TYPE_MAPPING, 1411 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1412 exp.DataType.Type.BIGINT: "INT64", 1413 exp.DataType.Type.BINARY: "BYTES", 1414 exp.DataType.Type.BLOB: "BYTES", 1415 exp.DataType.Type.BOOLEAN: "BOOL", 1416 exp.DataType.Type.CHAR: "STRING", 1417 exp.DataType.Type.DECIMAL: "NUMERIC", 1418 exp.DataType.Type.DOUBLE: "FLOAT64", 1419 exp.DataType.Type.FLOAT: "FLOAT64", 1420 exp.DataType.Type.INT: "INT64", 1421 exp.DataType.Type.NCHAR: "STRING", 1422 exp.DataType.Type.NVARCHAR: "STRING", 1423 exp.DataType.Type.SMALLINT: "INT64", 1424 exp.DataType.Type.TEXT: "STRING", 1425 exp.DataType.Type.TIMESTAMP: "DATETIME", 1426 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1427 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1428 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1429 exp.DataType.Type.TINYINT: "INT64", 1430 exp.DataType.Type.ROWVERSION: "BYTES", 1431 exp.DataType.Type.UUID: "STRING", 1432 exp.DataType.Type.VARBINARY: "BYTES", 1433 exp.DataType.Type.VARCHAR: "STRING", 1434 exp.DataType.Type.VARIANT: "ANY TYPE", 1435 } 1436 1437 PROPERTIES_LOCATION = { 1438 **generator.Generator.PROPERTIES_LOCATION, 1439 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1440 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1441 } 1442 1443 # WINDOW comes after QUALIFY 1444 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1445 AFTER_HAVING_MODIFIER_TRANSFORMS = { 
1446 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1447 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1448 } 1449 1450 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1451 RESERVED_KEYWORDS = { 1452 "all", 1453 "and", 1454 "any", 1455 "array", 1456 "as", 1457 "asc", 1458 "assert_rows_modified", 1459 "at", 1460 "between", 1461 "by", 1462 "case", 1463 "cast", 1464 "collate", 1465 "contains", 1466 "create", 1467 "cross", 1468 "cube", 1469 "current", 1470 "default", 1471 "define", 1472 "desc", 1473 "distinct", 1474 "else", 1475 "end", 1476 "enum", 1477 "escape", 1478 "except", 1479 "exclude", 1480 "exists", 1481 "extract", 1482 "false", 1483 "fetch", 1484 "following", 1485 "for", 1486 "from", 1487 "full", 1488 "group", 1489 "grouping", 1490 "groups", 1491 "hash", 1492 "having", 1493 "if", 1494 "ignore", 1495 "in", 1496 "inner", 1497 "intersect", 1498 "interval", 1499 "into", 1500 "is", 1501 "join", 1502 "lateral", 1503 "left", 1504 "like", 1505 "limit", 1506 "lookup", 1507 "merge", 1508 "natural", 1509 "new", 1510 "no", 1511 "not", 1512 "null", 1513 "nulls", 1514 "of", 1515 "on", 1516 "or", 1517 "order", 1518 "outer", 1519 "over", 1520 "partition", 1521 "preceding", 1522 "proto", 1523 "qualify", 1524 "range", 1525 "recursive", 1526 "respect", 1527 "right", 1528 "rollup", 1529 "rows", 1530 "select", 1531 "set", 1532 "some", 1533 "struct", 1534 "tablesample", 1535 "then", 1536 "to", 1537 "treat", 1538 "true", 1539 "unbounded", 1540 "union", 1541 "unnest", 1542 "using", 1543 "when", 1544 "where", 1545 "window", 1546 "with", 1547 "within", 1548 } 1549 1550 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1551 unit = expression.unit 1552 unit_sql = unit.name if unit.is_string else self.sql(unit) 1553 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1554 1555 def mod_sql(self, expression: exp.Mod) -> str: 1556 this = expression.this 1557 expr = expression.expression 1558 return self.func( 1559 "MOD", 1560 this.unnest() if isinstance(this, exp.Paren) else this, 1561 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1562 ) 1563 1564 def column_parts(self, expression: exp.Column) -> str: 1565 if expression.meta.get("quoted_column"): 1566 # If a column reference is of the form `dataset.table`.name, we need 1567 # to preserve the quoted table path, otherwise the reference breaks 1568 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1569 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1570 return f"{table_path}.{self.sql(expression, 'this')}" 1571 1572 return super().column_parts(expression) 1573 1574 def table_parts(self, expression: exp.Table) -> str: 1575 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1576 # we need to make sure the correct quoting is used in each case. 
1577 # 1578 # For example, if there is a CTE x that clashes with a schema name, then the former will 1579 # return the table y in that schema, whereas the latter will return the CTE's y column: 1580 # 1581 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1582 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1583 if expression.meta.get("quoted_table"): 1584 table_parts = ".".join(p.name for p in expression.parts) 1585 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1586 1587 return super().table_parts(expression) 1588 1589 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1590 this = expression.this 1591 if isinstance(this, exp.TsOrDsToDatetime): 1592 func_name = "FORMAT_DATETIME" 1593 elif isinstance(this, exp.TsOrDsToTimestamp): 1594 func_name = "FORMAT_TIMESTAMP" 1595 elif isinstance(this, exp.TsOrDsToTime): 1596 func_name = "FORMAT_TIME" 1597 else: 1598 func_name = "FORMAT_DATE" 1599 1600 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1601 return self.func( 1602 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1603 ) 1604 1605 def eq_sql(self, expression: exp.EQ) -> str: 1606 # Operands of = cannot be NULL in BigQuery 1607 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1608 if not isinstance(expression.parent, exp.Update): 1609 return "NULL" 1610 1611 return self.binary(expression, "=") 1612 1613 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1614 parent = expression.parent 1615 1616 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1617 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
1618 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1619 return self.func( 1620 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1621 ) 1622 1623 return super().attimezone_sql(expression) 1624 1625 def trycast_sql(self, expression: exp.TryCast) -> str: 1626 return self.cast_sql(expression, safe_prefix="SAFE_") 1627 1628 def bracket_sql(self, expression: exp.Bracket) -> str: 1629 this = expression.this 1630 expressions = expression.expressions 1631 1632 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1633 arg = expressions[0] 1634 if arg.type is None: 1635 from sqlglot.optimizer.annotate_types import annotate_types 1636 1637 arg = annotate_types(arg, dialect=self.dialect) 1638 1639 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1640 # BQ doesn't support bracket syntax with string values for structs 1641 return f"{self.sql(this)}.{arg.name}" 1642 1643 expressions_sql = self.expressions(expression, flat=True) 1644 offset = expression.args.get("offset") 1645 1646 if offset == 0: 1647 expressions_sql = f"OFFSET({expressions_sql})" 1648 elif offset == 1: 1649 expressions_sql = f"ORDINAL({expressions_sql})" 1650 elif offset is not None: 1651 self.unsupported(f"Unsupported array offset: {offset}") 1652 1653 if expression.args.get("safe"): 1654 expressions_sql = f"SAFE_{expressions_sql}" 1655 1656 return f"{self.sql(this)}[{expressions_sql}]" 1657 1658 def in_unnest_op(self, expression: exp.Unnest) -> str: 1659 return self.sql(expression) 1660 1661 def version_sql(self, expression: exp.Version) -> str: 1662 if expression.name == "TIMESTAMP": 1663 expression.set("this", "SYSTEM_TIME") 1664 return super().version_sql(expression) 1665 1666 def contains_sql(self, expression: exp.Contains) -> str: 1667 this = expression.this 1668 expr = expression.expression 1669 1670 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1671 this = this.this 1672 expr = expr.this 1673 1674 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1675 1676 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1677 this = expression.this 1678 1679 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1680 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1681 # because they aren't literals and so the above syntax is invalid BigQuery. 1682 if isinstance(this, exp.Array): 1683 elem = seq_get(this.expressions, 0) 1684 if not (elem and elem.find(exp.Query)): 1685 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1686 1687 return super().cast_sql(expression, safe_prefix=safe_prefix) 1688 1689 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1690 variables = self.expressions(expression, "this") 1691 default = self.sql(expression, "default") 1692 default = f" DEFAULT {default}" if default else "" 1693 kind = self.sql(expression, "kind") 1694 kind = f" {kind}" if kind else "" 1695 1696 return f"{variables}{kind}{default}"
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
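A minimal sketch (assuming sqlglot is installed; the orders/order_date names are illustrative) of where the offset matters, transpiling a Sunday-based BigQuery week to a Monday-based dialect such as DuckDB:

import sqlglot

# BigQuery weeks start on Sunday (WEEK_OFFSET = -1), whereas DuckDB's
# DATE_TRUNC assumes Monday, so week-based logic relies on this offset.
sql = "SELECT DATE_TRUNC(order_date, WEEK) FROM orders"
print(sqlglot.transpile(sql, read="bigquery", write="duckdb")[0])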
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG).
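A hedged sketch of the argument-order difference (Postgres is used here because it puts the base first; the exact output may vary by sqlglot version):

import sqlglot

# BigQuery's LOG(X, Y) takes the base second (LOG_BASE_FIRST = False),
# so transpiling to a base-first dialect should swap the arguments.
print(sqlglot.transpile("SELECT LOG(100, 10)", read="bigquery", write="postgres")[0])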
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS (SELECT 1 AS id, 2 AS my_id) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which forwards the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- ClickHouse, which forwards the alias across the query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
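A minimal sketch of the BigQuery behavior using sqlglot's qualifier (the query is the example above):

import sqlglot
from sqlglot.optimizer.qualify import qualify

sql = """
WITH data AS (SELECT 1 AS id, 2 AS my_id)
SELECT id AS my_id FROM data
WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
"""

# Under BigQuery semantics, the GROUP BY/HAVING references to my_id are
# expanded to the aliased expression (id) before columns are qualified.
print(qualify(sqlglot.parse_one(sql, read="bigquery"), dialect="bigquery").sql(dialect="bigquery"))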
Whether the name of the function should be preserved inside the node's metadata. This can be useful for roundtripping deprecated vs. new functions that share an AST node, e.g. JSON_VALUE vs. JSON_EXTRACT_SCALAR in BigQuery.
Whether hex strings such as x'CC' evaluate to an integer or a binary/blob type.
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
"upper" or True: Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy').
If empty, the corresponding trie will be constructed off of TIME_MAPPING.
Columns that are auto-generated by the engine corresponding to this dialect.
For example, such columns may be excluded from SELECT * queries.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
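A small sketch of the consequence (the expected output is an assumption based on the flag above):

import sqlglot

# A bare UNION is DISTINCT by default elsewhere, but BigQuery requires the
# qualifier to be explicit, so it should be spelled out on transpile.
print(sqlglot.transpile("SELECT 1 UNION SELECT 2", read="duckdb", write="bigquery")[0])
# e.g. SELECT 1 UNION DISTINCT SELECT 2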
672 def normalize_identifier(self, expression: E) -> E: 673 if ( 674 isinstance(expression, exp.Identifier) 675 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 676 ): 677 parent = expression.parent 678 while isinstance(parent, exp.Dot): 679 parent = parent.parent 680 681 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 682 # by default. The following check uses a heuristic to detect tables based on whether 683 # they are qualified. This should generally be correct, because tables in BigQuery 684 # must be qualified with at least a dataset, unless @@dataset_id is set. 685 case_sensitive = ( 686 isinstance(parent, exp.UserDefinedFunction) 687 or ( 688 isinstance(parent, exp.Table) 689 and parent.db 690 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 691 ) 692 or expression.meta.get("is_table") 693 ) 694 if not case_sensitive: 695 expression.set("this", expression.this.lower()) 696 697 return t.cast(E, expression) 698 699 return super().normalize_identifier(expression)
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it
lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so
it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive,
and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those of the underlying operating system; for example, they may always be case-sensitive on Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
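A small sketch of these differences; outputs depend on each dialect's defaults and the sqlglot version:

    from sqlglot import exp
    from sqlglot.dialects.dialect import Dialect

    ident = exp.to_identifier("FoO")  # unquoted identifier

    # Each dialect applies its own resolution rules to the same identifier.
    for name in ("postgres", "snowflake", "bigquery"):
        dialect = Dialect.get_or_raise(name)
        print(name, "->", dialect.normalize_identifier(ident.copy()).name)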
Mapping of an escaped sequence (\n) to its unescaped version (a literal newline).
701 class JSONPathTokenizer(jsonpath.JSONPathTokenizer): 702 VAR_TOKENS = { 703 TokenType.DASH, 704 TokenType.VAR, 705 }
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BIT_STRINGS
- BYTE_STRINGS
- HEX_STRINGS
- RAW_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- IDENTIFIERS
- QUOTES
- VAR_SINGLE_TOKENS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- COMMENTS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
707 class Tokenizer(tokens.Tokenizer): 708 QUOTES = ["'", '"', '"""', "'''"] 709 COMMENTS = ["--", "#", ("/*", "*/")] 710 IDENTIFIERS = ["`"] 711 STRING_ESCAPES = ["\\"] 712 713 HEX_STRINGS = [("0x", ""), ("0X", "")] 714 715 BYTE_STRINGS = [ 716 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 717 ] 718 719 RAW_STRINGS = [ 720 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 721 ] 722 723 NESTED_COMMENTS = False 724 725 KEYWORDS = { 726 **tokens.Tokenizer.KEYWORDS, 727 "ANY TYPE": TokenType.VARIANT, 728 "BEGIN": TokenType.COMMAND, 729 "BEGIN TRANSACTION": TokenType.BEGIN, 730 "BYTEINT": TokenType.INT, 731 "BYTES": TokenType.BINARY, 732 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 733 "DATETIME": TokenType.TIMESTAMP, 734 "DECLARE": TokenType.DECLARE, 735 "ELSEIF": TokenType.COMMAND, 736 "EXCEPTION": TokenType.COMMAND, 737 "EXPORT": TokenType.EXPORT, 738 "FLOAT64": TokenType.DOUBLE, 739 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 740 "LOOP": TokenType.COMMAND, 741 "MODEL": TokenType.MODEL, 742 "NOT DETERMINISTIC": TokenType.VOLATILE, 743 "RECORD": TokenType.STRUCT, 744 "REPEAT": TokenType.COMMAND, 745 "TIMESTAMP": TokenType.TIMESTAMPTZ, 746 "WHILE": TokenType.COMMAND, 747 } 748 KEYWORDS.pop("DIV") 749 KEYWORDS.pop("VALUES") 750 KEYWORDS.pop("/*+")
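A sketch of this tokenizer handling the BigQuery-specific literal prefixes and backtick identifiers defined above:

    import sqlglot

    sql = "SELECT b'bytes', r'raw', 0xCC FROM `dataset.tbl` -- trailing comment"
    # Each token carries its type and raw text.
    for token in sqlglot.tokenize(sql, read="bigquery"):
        print(token.token_type, repr(token.text))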
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
752 class Parser(parser.Parser): 753 PREFIXED_PIVOT_COLUMNS = True 754 LOG_DEFAULTS_TO_LN = True 755 SUPPORTS_IMPLICIT_UNNEST = True 756 JOINS_HAVE_EQUAL_PRECEDENCE = True 757 758 # BigQuery does not allow ASC/DESC to be used as an identifier 759 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 760 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 761 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 762 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 763 TokenType.ASC, 764 TokenType.DESC, 765 } 766 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 767 768 FUNCTIONS = { 769 **parser.Parser.FUNCTIONS, 770 "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list, 771 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 772 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 773 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 774 "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list, 775 "BOOL": exp.JSONBool.from_arg_list, 776 "CONTAINS_SUBSTR": _build_contains_substring, 777 "DATE": _build_date, 778 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 779 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 780 "DATE_TRUNC": lambda args: exp.DateTrunc( 781 unit=seq_get(args, 1), 782 this=seq_get(args, 0), 783 zone=seq_get(args, 2), 784 ), 785 "DATETIME": _build_datetime, 786 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 787 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 788 "DIV": binary_from_function(exp.IntDiv), 789 "EDIT_DISTANCE": _build_levenshtein, 790 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 791 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 792 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 793 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 794 "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 795 "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list, 796 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 797 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 798 "JSON_STRIP_NULLS": _build_json_strip_nulls, 799 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 800 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 801 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 802 "MD5": exp.MD5Digest.from_arg_list, 803 "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize( 804 this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True 805 ), 806 "OCTET_LENGTH": exp.ByteLength.from_arg_list, 807 "TO_HEX": _build_to_hex, 808 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 809 [seq_get(args, 1), seq_get(args, 0)] 810 ), 811 "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")( 812 [seq_get(args, 1), seq_get(args, 0)] 813 ), 814 "PARSE_TIMESTAMP": _build_parse_timestamp, 815 "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")( 816 [seq_get(args, 1), seq_get(args, 0)] 817 ), 818 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 819 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 820 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 821 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 822 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 823 ), 824 "SHA256": lambda args: 
exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 825 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 826 "SPLIT": lambda args: exp.Split( 827 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 828 this=seq_get(args, 0), 829 expression=seq_get(args, 1) or exp.Literal.string(","), 830 ), 831 "STRPOS": exp.StrPosition.from_arg_list, 832 "TIME": _build_time, 833 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 834 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 835 "TIMESTAMP": _build_timestamp, 836 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 837 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 838 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 839 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 840 ), 841 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 842 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 843 ), 844 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 845 "TO_JSON": lambda args: exp.JSONFormat( 846 this=seq_get(args, 0), options=seq_get(args, 1), to_json=True 847 ), 848 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 849 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 850 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 851 "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime), 852 "FROM_HEX": exp.Unhex.from_arg_list, 853 "WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 854 } 855 856 FUNCTION_PARSERS = { 857 **parser.Parser.FUNCTION_PARSERS, 858 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 859 "JSON_ARRAY": lambda self: self.expression( 860 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 861 ), 862 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 863 "PREDICT": lambda self: self._parse_predict(), 864 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 865 "GENERATE_EMBEDDING": lambda self: self._parse_generate_embedding(), 866 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 867 } 868 FUNCTION_PARSERS.pop("TRIM") 869 870 NO_PAREN_FUNCTIONS = { 871 **parser.Parser.NO_PAREN_FUNCTIONS, 872 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 873 } 874 875 NESTED_TYPE_TOKENS = { 876 *parser.Parser.NESTED_TYPE_TOKENS, 877 TokenType.TABLE, 878 } 879 880 PROPERTY_PARSERS = { 881 **parser.Parser.PROPERTY_PARSERS, 882 "NOT DETERMINISTIC": lambda self: self.expression( 883 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 884 ), 885 "OPTIONS": lambda self: self._parse_with_property(), 886 } 887 888 CONSTRAINT_PARSERS = { 889 **parser.Parser.CONSTRAINT_PARSERS, 890 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 891 } 892 893 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 894 RANGE_PARSERS.pop(TokenType.OVERLAPS) 895 896 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 897 898 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 899 900 STATEMENT_PARSERS = { 901 **parser.Parser.STATEMENT_PARSERS, 902 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 903 TokenType.END: lambda self: self._parse_as_command(self._prev), 904 TokenType.FOR: lambda self: self._parse_for_in(), 905 TokenType.EXPORT: lambda self: self._parse_export_data(), 906 TokenType.DECLARE: lambda self: self._parse_declare(), 907 } 908 909 BRACKET_OFFSETS = { 910 "OFFSET": (0, False), 911 "ORDINAL": (1, False), 912 
"SAFE_OFFSET": (0, True), 913 "SAFE_ORDINAL": (1, True), 914 } 915 916 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 917 index = self._index 918 this = self._parse_range() 919 self._match_text_seq("DO") 920 if self._match(TokenType.COMMAND): 921 self._retreat(index) 922 return self._parse_as_command(self._prev) 923 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 924 925 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 926 this = super()._parse_table_part(schema=schema) or self._parse_number() 927 928 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 929 if isinstance(this, exp.Identifier): 930 table_name = this.name 931 while self._match(TokenType.DASH, advance=False) and self._next: 932 start = self._curr 933 while self._is_connected() and not self._match_set( 934 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 935 ): 936 self._advance() 937 938 if start == self._curr: 939 break 940 941 table_name += self._find_sql(start, self._prev) 942 943 this = exp.Identifier( 944 this=table_name, quoted=this.args.get("quoted") 945 ).update_positions(this) 946 elif isinstance(this, exp.Literal): 947 table_name = this.name 948 949 if self._is_connected() and self._parse_var(any_token=True): 950 table_name += self._prev.text 951 952 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 953 954 return this 955 956 def _parse_table_parts( 957 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 958 ) -> exp.Table: 959 table = super()._parse_table_parts( 960 schema=schema, is_db_reference=is_db_reference, wildcard=True 961 ) 962 963 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 964 if not table.catalog: 965 if table.db: 966 previous_db = table.args["db"] 967 parts = table.db.split(".") 968 if len(parts) == 2 and not table.args["db"].quoted: 969 table.set( 970 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 971 ) 972 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 973 else: 974 previous_this = table.this 975 parts = table.name.split(".") 976 if len(parts) == 2 and not table.this.quoted: 977 table.set( 978 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 979 ) 980 table.set( 981 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 982 ) 983 984 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 985 alias = table.this 986 catalog, db, this, *rest = ( 987 exp.to_identifier(p, quoted=True) 988 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 989 ) 990 991 for part in (catalog, db, this): 992 if part: 993 part.update_positions(table.this) 994 995 if rest and this: 996 this = exp.Dot.build([this, *rest]) # type: ignore 997 998 table = exp.Table( 999 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 1000 ) 1001 table.meta["quoted_table"] = True 1002 else: 1003 alias = None 1004 1005 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 1006 # dataset, so if the project identifier is omitted we need to fix the ast so that 1007 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 1008 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 1009 # views, because it would seem like the "catalog" part is set, when it'd actually 1010 # be the region/dataset. 
Merging the two identifiers into a single one is done to 1011 # avoid producing a 4-part Table reference, which would cause issues in the schema 1012 # module, when there are 3-part table names mixed with information schema views. 1013 # 1014 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 1015 table_parts = table.parts 1016 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 1017 # We need to alias the table here to avoid breaking existing qualified columns. 1018 # This is expected to be safe, because if there's an actual alias coming up in 1019 # the token stream, it will overwrite this one. If there isn't one, we are only 1020 # exposing the name that can be used to reference the view explicitly (a no-op). 1021 exp.alias_( 1022 table, 1023 t.cast(exp.Identifier, alias or table_parts[-1]), 1024 table=True, 1025 copy=False, 1026 ) 1027 1028 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 1029 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 1030 line=table_parts[-2].meta.get("line"), 1031 col=table_parts[-1].meta.get("col"), 1032 start=table_parts[-2].meta.get("start"), 1033 end=table_parts[-1].meta.get("end"), 1034 ) 1035 table.set("this", new_this) 1036 table.set("db", seq_get(table_parts, -3)) 1037 table.set("catalog", seq_get(table_parts, -4)) 1038 1039 return table 1040 1041 def _parse_column(self) -> t.Optional[exp.Expression]: 1042 column = super()._parse_column() 1043 if isinstance(column, exp.Column): 1044 parts = column.parts 1045 if any("." in p.name for p in parts): 1046 catalog, db, table, this, *rest = ( 1047 exp.to_identifier(p, quoted=True) 1048 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 1049 ) 1050 1051 if rest and this: 1052 this = exp.Dot.build([this, *rest]) # type: ignore 1053 1054 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 1055 column.meta["quoted_column"] = True 1056 1057 return column 1058 1059 @t.overload 1060 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 1061 1062 @t.overload 1063 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 
1064 1065 def _parse_json_object(self, agg=False): 1066 json_object = super()._parse_json_object() 1067 array_kv_pair = seq_get(json_object.expressions, 0) 1068 1069 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 1070 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 1071 if ( 1072 array_kv_pair 1073 and isinstance(array_kv_pair.this, exp.Array) 1074 and isinstance(array_kv_pair.expression, exp.Array) 1075 ): 1076 keys = array_kv_pair.this.expressions 1077 values = array_kv_pair.expression.expressions 1078 1079 json_object.set( 1080 "expressions", 1081 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 1082 ) 1083 1084 return json_object 1085 1086 def _parse_bracket( 1087 self, this: t.Optional[exp.Expression] = None 1088 ) -> t.Optional[exp.Expression]: 1089 bracket = super()._parse_bracket(this) 1090 1091 if this is bracket: 1092 return bracket 1093 1094 if isinstance(bracket, exp.Bracket): 1095 for expression in bracket.expressions: 1096 name = expression.name.upper() 1097 1098 if name not in self.BRACKET_OFFSETS: 1099 break 1100 1101 offset, safe = self.BRACKET_OFFSETS[name] 1102 bracket.set("offset", offset) 1103 bracket.set("safe", safe) 1104 expression.replace(expression.expressions[0]) 1105 1106 return bracket 1107 1108 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 1109 unnest = super()._parse_unnest(with_alias=with_alias) 1110 1111 if not unnest: 1112 return None 1113 1114 unnest_expr = seq_get(unnest.expressions, 0) 1115 if unnest_expr: 1116 from sqlglot.optimizer.annotate_types import annotate_types 1117 1118 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 1119 1120 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 1121 # in contrast to other dialects such as DuckDB which flattens only the array by default 1122 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 1123 array_elem.is_type(exp.DataType.Type.STRUCT) 1124 for array_elem in unnest_expr._type.expressions 1125 ): 1126 unnest.set("explode_array", True) 1127 1128 return unnest 1129 1130 def _parse_make_interval(self) -> exp.MakeInterval: 1131 expr = exp.MakeInterval() 1132 1133 for arg_key in expr.arg_types: 1134 value = self._parse_lambda() 1135 1136 if not value: 1137 break 1138 1139 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 1140 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 1141 if isinstance(value, exp.Kwarg): 1142 arg_key = value.this.name 1143 1144 expr.set(arg_key, value) 1145 1146 self._match(TokenType.COMMA) 1147 1148 return expr 1149 1150 def _parse_predict(self) -> exp.Predict: 1151 self._match_text_seq("MODEL") 1152 this = self._parse_table() 1153 1154 self._match(TokenType.COMMA) 1155 self._match_text_seq("TABLE") 1156 1157 return self.expression( 1158 exp.Predict, 1159 this=this, 1160 expression=self._parse_table(), 1161 params_struct=self._match(TokenType.COMMA) and self._parse_bitwise(), 1162 ) 1163 1164 def _parse_generate_embedding(self) -> exp.GenerateEmbedding: 1165 self._match_text_seq("MODEL") 1166 this = self._parse_table() 1167 1168 self._match(TokenType.COMMA) 1169 self._match_text_seq("TABLE") 1170 1171 return self.expression( 1172 exp.GenerateEmbedding, 1173 this=this, 1174 expression=self._parse_table(), 1175 params_struct=self._match(TokenType.COMMA) and self._parse_bitwise(), 1176 ) 1177 1178 def 
_parse_features_at_time(self) -> exp.FeaturesAtTime: 1179 self._match(TokenType.TABLE) 1180 this = self._parse_table() 1181 1182 expr = self.expression(exp.FeaturesAtTime, this=this) 1183 1184 while self._match(TokenType.COMMA): 1185 arg = self._parse_lambda() 1186 1187 # Get the LHS of the Kwarg and set the arg to that value, e.g 1188 # "num_rows => 1" sets the expr's `num_rows` arg 1189 if arg: 1190 expr.set(arg.this.name, arg) 1191 1192 return expr 1193 1194 def _parse_vector_search(self) -> exp.VectorSearch: 1195 self._match(TokenType.TABLE) 1196 base_table = self._parse_table() 1197 1198 self._match(TokenType.COMMA) 1199 1200 column_to_search = self._parse_bitwise() 1201 self._match(TokenType.COMMA) 1202 1203 self._match(TokenType.TABLE) 1204 query_table = self._parse_table() 1205 1206 expr = self.expression( 1207 exp.VectorSearch, 1208 this=base_table, 1209 column_to_search=column_to_search, 1210 query_table=query_table, 1211 ) 1212 1213 while self._match(TokenType.COMMA): 1214 # query_column_to_search can be named argument or positional 1215 if self._match(TokenType.STRING, advance=False): 1216 query_column = self._parse_string() 1217 expr.set("query_column_to_search", query_column) 1218 else: 1219 arg = self._parse_lambda() 1220 if arg: 1221 expr.set(arg.this.name, arg) 1222 1223 return expr 1224 1225 def _parse_export_data(self) -> exp.Export: 1226 self._match_text_seq("DATA") 1227 1228 return self.expression( 1229 exp.Export, 1230 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 1231 options=self._parse_properties(), 1232 this=self._match_text_seq("AS") and self._parse_select(), 1233 )
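Two BigQuery-specific behaviors from the parser above, sketched (output shapes may differ across versions): dashed table parts are folded into a single identifier, and the OFFSET/ORDINAL wrappers populate the bracket's offset/safe args:

    import sqlglot
    from sqlglot import exp

    # Dashed project names parse into one table identifier.
    print(sqlglot.parse_one("SELECT * FROM my-project.dataset.tbl", read="bigquery").sql("bigquery"))

    # SAFE_OFFSET(0) becomes a Bracket with offset=0 and safe=True.
    ast = sqlglot.parse_one("SELECT arr[SAFE_OFFSET(0)] FROM t", read="bigquery")
    print(repr(ast.find(exp.Bracket)))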
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- CAST_COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- QUERY_MODIFIER_TOKENS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS
- JSON_EXTRACT_REQUIRES_JSON_EXPRESSION
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- build_cast
- errors
- sql
1235 class Generator(generator.Generator): 1236 INTERVAL_ALLOWS_PLURAL_FORM = False 1237 JOIN_HINTS = False 1238 QUERY_HINTS = False 1239 TABLE_HINTS = False 1240 LIMIT_FETCH = "LIMIT" 1241 RENAME_TABLE_WITH_DB = False 1242 NVL2_SUPPORTED = False 1243 UNNEST_WITH_ORDINALITY = False 1244 COLLATE_IS_FUNC = True 1245 LIMIT_ONLY_LITERALS = True 1246 SUPPORTS_TABLE_ALIAS_COLUMNS = False 1247 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 1248 JSON_KEY_VALUE_PAIR_SEP = "," 1249 NULL_ORDERING_SUPPORTED = False 1250 IGNORE_NULLS_IN_FUNC = True 1251 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 1252 CAN_IMPLEMENT_ARRAY_ANY = True 1253 SUPPORTS_TO_NUMBER = False 1254 NAMED_PLACEHOLDER_TOKEN = "@" 1255 HEX_FUNC = "TO_HEX" 1256 WITH_PROPERTIES_PREFIX = "OPTIONS" 1257 SUPPORTS_EXPLODING_PROJECTIONS = False 1258 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 1259 SUPPORTS_UNIX_SECONDS = True 1260 1261 SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$") 1262 1263 TS_OR_DS_TYPES = ( 1264 exp.TsOrDsToDatetime, 1265 exp.TsOrDsToTimestamp, 1266 exp.TsOrDsToTime, 1267 exp.TsOrDsToDate, 1268 ) 1269 1270 TRANSFORMS = { 1271 **generator.Generator.TRANSFORMS, 1272 exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"), 1273 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 1274 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 1275 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 1276 exp.Array: inline_array_unless_query, 1277 exp.ArrayContains: _array_contains_sql, 1278 exp.ArrayFilter: filter_array_using_unnest, 1279 exp.ArrayRemove: filter_array_using_unnest, 1280 exp.BitwiseAndAgg: rename_func("BIT_AND"), 1281 exp.BitwiseOrAgg: rename_func("BIT_OR"), 1282 exp.BitwiseXorAgg: rename_func("BIT_XOR"), 1283 exp.BitwiseCountAgg: rename_func("BIT_COUNT"), 1284 exp.ByteLength: rename_func("BYTE_LENGTH"), 1285 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 1286 exp.CollateProperty: lambda self, e: ( 1287 f"DEFAULT COLLATE {self.sql(e, 'this')}" 1288 if e.args.get("default") 1289 else f"COLLATE {self.sql(e, 'this')}" 1290 ), 1291 exp.Commit: lambda *_: "COMMIT TRANSACTION", 1292 exp.CountIf: rename_func("COUNTIF"), 1293 exp.Create: _create_sql, 1294 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 1295 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 1296 exp.DateDiff: lambda self, e: self.func( 1297 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 1298 ), 1299 exp.DateFromParts: rename_func("DATE"), 1300 exp.DateStrToDate: datestrtodate_sql, 1301 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 1302 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 1303 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 1304 exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"), 1305 exp.FromTimeZone: lambda self, e: self.func( 1306 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 1307 ), 1308 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 1309 exp.GroupConcat: lambda self, e: groupconcat_sql( 1310 self, e, func_name="STRING_AGG", within_group=False 1311 ), 1312 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1313 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1314 exp.If: if_sql(false_value="NULL"), 1315 exp.ILike: no_ilike_sql, 1316 exp.IntDiv: rename_func("DIV"), 1317 exp.Int64: rename_func("INT64"), 1318 exp.JSONBool: rename_func("BOOL"), 1319 exp.JSONExtract: _json_extract_sql, 1320 exp.JSONExtractArray: _json_extract_sql, 1321 exp.JSONExtractScalar: 
_json_extract_sql, 1322 exp.JSONFormat: lambda self, e: self.func( 1323 "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING", 1324 e.this, 1325 e.args.get("options"), 1326 ), 1327 exp.JSONKeysAtDepth: rename_func("JSON_KEYS"), 1328 exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"), 1329 exp.Levenshtein: _levenshtein_sql, 1330 exp.Max: max_or_greatest, 1331 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1332 exp.MD5Digest: rename_func("MD5"), 1333 exp.Min: min_or_least, 1334 exp.Normalize: lambda self, e: self.func( 1335 "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE", 1336 e.this, 1337 e.args.get("form"), 1338 ), 1339 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1340 exp.RegexpExtract: lambda self, e: self.func( 1341 "REGEXP_EXTRACT", 1342 e.this, 1343 e.expression, 1344 e.args.get("position"), 1345 e.args.get("occurrence"), 1346 ), 1347 exp.RegexpExtractAll: lambda self, e: self.func( 1348 "REGEXP_EXTRACT_ALL", e.this, e.expression 1349 ), 1350 exp.RegexpReplace: regexp_replace_sql, 1351 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1352 exp.ReturnsProperty: _returnsproperty_sql, 1353 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1354 exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this), 1355 exp.ParseDatetime: lambda self, e: self.func( 1356 "PARSE_DATETIME", self.format_time(e), e.this 1357 ), 1358 exp.Select: transforms.preprocess( 1359 [ 1360 transforms.explode_projection_to_unnest(), 1361 transforms.unqualify_unnest, 1362 transforms.eliminate_distinct_on, 1363 _alias_ordered_group, 1364 transforms.eliminate_semi_and_anti_joins, 1365 ] 1366 ), 1367 exp.SHA: rename_func("SHA1"), 1368 exp.SHA2: sha256_sql, 1369 exp.StabilityProperty: lambda self, e: ( 1370 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1371 ), 1372 exp.String: rename_func("STRING"), 1373 exp.StrPosition: lambda self, e: ( 1374 strposition_sql( 1375 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1376 ) 1377 ), 1378 exp.StrToDate: _str_to_datetime_sql, 1379 exp.StrToTime: _str_to_datetime_sql, 1380 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1381 exp.TimeFromParts: rename_func("TIME"), 1382 exp.TimestampFromParts: rename_func("DATETIME"), 1383 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1384 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1385 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1386 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1387 exp.TimeStrToTime: timestrtotime_sql, 1388 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1389 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1390 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1391 exp.TsOrDsToTime: rename_func("TIME"), 1392 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1393 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1394 exp.Unhex: rename_func("FROM_HEX"), 1395 exp.UnixDate: rename_func("UNIX_DATE"), 1396 exp.UnixToTime: _unix_to_time_sql, 1397 exp.Uuid: lambda *_: "GENERATE_UUID()", 1398 exp.Values: _derived_table_values_to_unnest, 1399 exp.VariancePop: rename_func("VAR_POP"), 1400 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1401 } 1402 1403 SUPPORTED_JSON_PATH_PARTS = { 1404 exp.JSONPathKey, 1405 exp.JSONPathRoot, 1406 exp.JSONPathSubscript, 1407 } 1408 1409 TYPE_MAPPING = { 1410 **generator.Generator.TYPE_MAPPING, 1411 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1412 exp.DataType.Type.BIGINT: "INT64", 1413 exp.DataType.Type.BINARY: "BYTES", 1414 
exp.DataType.Type.BLOB: "BYTES", 1415 exp.DataType.Type.BOOLEAN: "BOOL", 1416 exp.DataType.Type.CHAR: "STRING", 1417 exp.DataType.Type.DECIMAL: "NUMERIC", 1418 exp.DataType.Type.DOUBLE: "FLOAT64", 1419 exp.DataType.Type.FLOAT: "FLOAT64", 1420 exp.DataType.Type.INT: "INT64", 1421 exp.DataType.Type.NCHAR: "STRING", 1422 exp.DataType.Type.NVARCHAR: "STRING", 1423 exp.DataType.Type.SMALLINT: "INT64", 1424 exp.DataType.Type.TEXT: "STRING", 1425 exp.DataType.Type.TIMESTAMP: "DATETIME", 1426 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1427 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1428 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1429 exp.DataType.Type.TINYINT: "INT64", 1430 exp.DataType.Type.ROWVERSION: "BYTES", 1431 exp.DataType.Type.UUID: "STRING", 1432 exp.DataType.Type.VARBINARY: "BYTES", 1433 exp.DataType.Type.VARCHAR: "STRING", 1434 exp.DataType.Type.VARIANT: "ANY TYPE", 1435 } 1436 1437 PROPERTIES_LOCATION = { 1438 **generator.Generator.PROPERTIES_LOCATION, 1439 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1440 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1441 } 1442 1443 # WINDOW comes after QUALIFY 1444 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1445 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1446 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1447 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1448 } 1449 1450 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1451 RESERVED_KEYWORDS = { 1452 "all", 1453 "and", 1454 "any", 1455 "array", 1456 "as", 1457 "asc", 1458 "assert_rows_modified", 1459 "at", 1460 "between", 1461 "by", 1462 "case", 1463 "cast", 1464 "collate", 1465 "contains", 1466 "create", 1467 "cross", 1468 "cube", 1469 "current", 1470 "default", 1471 "define", 1472 "desc", 1473 "distinct", 1474 "else", 1475 "end", 1476 "enum", 1477 "escape", 1478 "except", 1479 "exclude", 1480 "exists", 1481 "extract", 1482 "false", 1483 "fetch", 1484 "following", 1485 "for", 1486 "from", 1487 "full", 1488 "group", 1489 "grouping", 1490 "groups", 1491 "hash", 1492 "having", 1493 "if", 1494 "ignore", 1495 "in", 1496 "inner", 1497 "intersect", 1498 "interval", 1499 "into", 1500 "is", 1501 "join", 1502 "lateral", 1503 "left", 1504 "like", 1505 "limit", 1506 "lookup", 1507 "merge", 1508 "natural", 1509 "new", 1510 "no", 1511 "not", 1512 "null", 1513 "nulls", 1514 "of", 1515 "on", 1516 "or", 1517 "order", 1518 "outer", 1519 "over", 1520 "partition", 1521 "preceding", 1522 "proto", 1523 "qualify", 1524 "range", 1525 "recursive", 1526 "respect", 1527 "right", 1528 "rollup", 1529 "rows", 1530 "select", 1531 "set", 1532 "some", 1533 "struct", 1534 "tablesample", 1535 "then", 1536 "to", 1537 "treat", 1538 "true", 1539 "unbounded", 1540 "union", 1541 "unnest", 1542 "using", 1543 "when", 1544 "where", 1545 "window", 1546 "with", 1547 "within", 1548 } 1549 1550 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1551 unit = expression.unit 1552 unit_sql = unit.name if unit.is_string else self.sql(unit) 1553 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1554 1555 def mod_sql(self, expression: exp.Mod) -> str: 1556 this = expression.this 1557 expr = expression.expression 1558 return self.func( 1559 "MOD", 1560 this.unnest() if isinstance(this, exp.Paren) else this, 1561 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1562 ) 1563 1564 def column_parts(self, expression: 
exp.Column) -> str: 1565 if expression.meta.get("quoted_column"): 1566 # If a column reference is of the form `dataset.table`.name, we need 1567 # to preserve the quoted table path, otherwise the reference breaks 1568 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1569 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1570 return f"{table_path}.{self.sql(expression, 'this')}" 1571 1572 return super().column_parts(expression) 1573 1574 def table_parts(self, expression: exp.Table) -> str: 1575 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1576 # we need to make sure the correct quoting is used in each case. 1577 # 1578 # For example, if there is a CTE x that clashes with a schema name, then the former will 1579 # return the table y in that schema, whereas the latter will return the CTE's y column: 1580 # 1581 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1582 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1583 if expression.meta.get("quoted_table"): 1584 table_parts = ".".join(p.name for p in expression.parts) 1585 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1586 1587 return super().table_parts(expression) 1588 1589 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1590 this = expression.this 1591 if isinstance(this, exp.TsOrDsToDatetime): 1592 func_name = "FORMAT_DATETIME" 1593 elif isinstance(this, exp.TsOrDsToTimestamp): 1594 func_name = "FORMAT_TIMESTAMP" 1595 elif isinstance(this, exp.TsOrDsToTime): 1596 func_name = "FORMAT_TIME" 1597 else: 1598 func_name = "FORMAT_DATE" 1599 1600 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1601 return self.func( 1602 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1603 ) 1604 1605 def eq_sql(self, expression: exp.EQ) -> str: 1606 # Operands of = cannot be NULL in BigQuery 1607 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1608 if not isinstance(expression.parent, exp.Update): 1609 return "NULL" 1610 1611 return self.binary(expression, "=") 1612 1613 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1614 parent = expression.parent 1615 1616 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1617 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
1618 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1619 return self.func( 1620 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1621 ) 1622 1623 return super().attimezone_sql(expression) 1624 1625 def trycast_sql(self, expression: exp.TryCast) -> str: 1626 return self.cast_sql(expression, safe_prefix="SAFE_") 1627 1628 def bracket_sql(self, expression: exp.Bracket) -> str: 1629 this = expression.this 1630 expressions = expression.expressions 1631 1632 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1633 arg = expressions[0] 1634 if arg.type is None: 1635 from sqlglot.optimizer.annotate_types import annotate_types 1636 1637 arg = annotate_types(arg, dialect=self.dialect) 1638 1639 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1640 # BQ doesn't support bracket syntax with string values for structs 1641 return f"{self.sql(this)}.{arg.name}" 1642 1643 expressions_sql = self.expressions(expression, flat=True) 1644 offset = expression.args.get("offset") 1645 1646 if offset == 0: 1647 expressions_sql = f"OFFSET({expressions_sql})" 1648 elif offset == 1: 1649 expressions_sql = f"ORDINAL({expressions_sql})" 1650 elif offset is not None: 1651 self.unsupported(f"Unsupported array offset: {offset}") 1652 1653 if expression.args.get("safe"): 1654 expressions_sql = f"SAFE_{expressions_sql}" 1655 1656 return f"{self.sql(this)}[{expressions_sql}]" 1657 1658 def in_unnest_op(self, expression: exp.Unnest) -> str: 1659 return self.sql(expression) 1660 1661 def version_sql(self, expression: exp.Version) -> str: 1662 if expression.name == "TIMESTAMP": 1663 expression.set("this", "SYSTEM_TIME") 1664 return super().version_sql(expression) 1665 1666 def contains_sql(self, expression: exp.Contains) -> str: 1667 this = expression.this 1668 expr = expression.expression 1669 1670 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1671 this = this.this 1672 expr = expr.this 1673 1674 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1675 1676 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1677 this = expression.this 1678 1679 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1680 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1681 # because they aren't literals and so the above syntax is invalid BigQuery. 1682 if isinstance(this, exp.Array): 1683 elem = seq_get(this.expressions, 0) 1684 if not (elem and elem.find(exp.Query)): 1685 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1686 1687 return super().cast_sql(expression, safe_prefix=safe_prefix) 1688 1689 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1690 variables = self.expressions(expression, "this") 1691 default = self.sql(expression, "default") 1692 default = f" DEFAULT {default}" if default else "" 1693 kind = self.sql(expression, "kind") 1694 kind = f" {kind}" if kind else "" 1695 1696 return f"{variables}{kind}{default}"
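A few of the TRANSFORMS above in action; a sketch with assumed outputs in the comments:

    import sqlglot

    # exp.Uuid is rendered as GENERATE_UUID().
    print(sqlglot.transpile("SELECT UUID()", write="bigquery")[0])
    # e.g. SELECT GENERATE_UUID()

    # ILIKE is unsupported, so it is lowered via no_ilike_sql.
    print(sqlglot.transpile("SELECT x ILIKE 'a%'", write="bigquery")[0])
    # e.g. SELECT LOWER(x) LIKE 'a%'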
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
1564 def column_parts(self, expression: exp.Column) -> str: 1565 if expression.meta.get("quoted_column"): 1566 # If a column reference is of the form `dataset.table`.name, we need 1567 # to preserve the quoted table path, otherwise the reference breaks 1568 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1569 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1570 return f"{table_path}.{self.sql(expression, 'this')}" 1571 1572 return super().column_parts(expression)
1574 def table_parts(self, expression: exp.Table) -> str: 1575 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1576 # we need to make sure the correct quoting is used in each case. 1577 # 1578 # For example, if there is a CTE x that clashes with a schema name, then the former will 1579 # return the table y in that schema, whereas the latter will return the CTE's y column: 1580 # 1581 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1582 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1583 if expression.meta.get("quoted_table"): 1584 table_parts = ".".join(p.name for p in expression.parts) 1585 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1586 1587 return super().table_parts(expression)
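So both spellings should round-trip with their original quoting intact; a sketch with assumed outputs:

    import sqlglot

    # `x.y` stays a single quoted identifier; `x`.`y` stays two parts.
    print(sqlglot.transpile("SELECT * FROM `x.y`", read="bigquery", write="bigquery")[0])
    print(sqlglot.transpile("SELECT * FROM `x`.`y`", read="bigquery", write="bigquery")[0])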
1589 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1590 this = expression.this 1591 if isinstance(this, exp.TsOrDsToDatetime): 1592 func_name = "FORMAT_DATETIME" 1593 elif isinstance(this, exp.TsOrDsToTimestamp): 1594 func_name = "FORMAT_TIMESTAMP" 1595 elif isinstance(this, exp.TsOrDsToTime): 1596 func_name = "FORMAT_TIME" 1597 else: 1598 func_name = "FORMAT_DATE" 1599 1600 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1601 return self.func( 1602 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1603 )
1605 def eq_sql(self, expression: exp.EQ) -> str: 1606 # Operands of = cannot be NULL in BigQuery 1607 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1608 if not isinstance(expression.parent, exp.Update): 1609 return "NULL" 1610 1611 return self.binary(expression, "=")
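For example, a comparison against NULL outside an UPDATE should fold to NULL; the assumed output is shown in the comment:

    import sqlglot

    print(sqlglot.transpile("SELECT a = NULL FROM t", write="bigquery")[0])
    # e.g. SELECT NULL FROM t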
1613 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1614 parent = expression.parent 1615 1616 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1617 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1618 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1619 return self.func( 1620 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1621 ) 1622 1623 return super().attimezone_sql(expression)
1628 def bracket_sql(self, expression: exp.Bracket) -> str: 1629 this = expression.this 1630 expressions = expression.expressions 1631 1632 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1633 arg = expressions[0] 1634 if arg.type is None: 1635 from sqlglot.optimizer.annotate_types import annotate_types 1636 1637 arg = annotate_types(arg, dialect=self.dialect) 1638 1639 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1640 # BQ doesn't support bracket syntax with string values for structs 1641 return f"{self.sql(this)}.{arg.name}" 1642 1643 expressions_sql = self.expressions(expression, flat=True) 1644 offset = expression.args.get("offset") 1645 1646 if offset == 0: 1647 expressions_sql = f"OFFSET({expressions_sql})" 1648 elif offset == 1: 1649 expressions_sql = f"ORDINAL({expressions_sql})" 1650 elif offset is not None: 1651 self.unsupported(f"Unsupported array offset: {offset}") 1652 1653 if expression.args.get("safe"): 1654 expressions_sql = f"SAFE_{expressions_sql}" 1655 1656 return f"{self.sql(this)}[{expressions_sql}]"
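A round-trip sketch of the positional wrappers handled above:

    import sqlglot

    # OFFSET/ORDINAL and their SAFE_ variants are parsed into args and re-emitted.
    sql = "SELECT arr[OFFSET(0)], arr[ORDINAL(1)], arr[SAFE_OFFSET(2)] FROM t"
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])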
1666 def contains_sql(self, expression: exp.Contains) -> str: 1667 this = expression.this 1668 expr = expression.expression 1669 1670 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1671 this = this.this 1672 expr = expr.this 1673 1674 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope"))
1676 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1677 this = expression.this 1678 1679 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1680 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1681 # because they aren't literals and so the above syntax is invalid BigQuery. 1682 if isinstance(this, exp.Array): 1683 elem = seq_get(this.expressions, 0) 1684 if not (elem and elem.find(exp.Query)): 1685 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1686 1687 return super().cast_sql(expression, safe_prefix=safe_prefix)
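This keeps typed array literals intact; a round-trip sketch:

    import sqlglot

    # The inline annotation is preserved rather than generating
    # CAST([1, 2, 3] AS ARRAY<INT64>).
    print(sqlglot.transpile("SELECT ARRAY<INT64>[1, 2, 3]", read="bigquery", write="bigquery")[0])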
1689 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1690 variables = self.expressions(expression, "this") 1691 default = self.sql(expression, "default") 1692 default = f" DEFAULT {default}" if default else "" 1693 kind = self.sql(expression, "kind") 1694 kind = f" {kind}" if kind else "" 1695 1696 return f"{variables}{kind}{default}"
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SUPPORTS_WINDOW_EXCLUDE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- UNICODE_SUBSTITUTE
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- SUPPORTS_BETWEEN_FLAGS
- SUPPORTS_LIKE_QUANTIFIERS
- MATCH_AGAINST_TABLE_PREFIX
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- altersession_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- is_sql
- like_sql
- ilike_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- space_sql
- buildproperty_sql
- refreshtriggerproperty_sql