From c0cb6b5d8ed2333660e06aed00dddf57719e77ca Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 20 May 2026 22:41:35 +0000 Subject: [PATCH 01/21] feat(bigframes): Defer unnamed @udf deployment until needed --- .../bigframes/functions/_function_session.py | 127 +++++++++++++++--- .../bigframes/bigframes/functions/function.py | 8 +- .../bigframes/bigframes/functions/udf_def.py | 56 +++++++- .../bigframes/bigframes/session/__init__.py | 2 +- .../bigframes/session/bq_caching_executor.py | 75 +++++++++++ .../large/functions/test_managed_function.py | 63 +++++++++ .../unit/functions/test_remote_function.py | 63 +++++++++ 7 files changed, 372 insertions(+), 22 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 186ecbcf3e1f..27b0a250db65 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -64,6 +64,9 @@ def __init__(self): # Lock to synchronize the update of the session artifacts self._artifacts_lock = threading.Lock() + self._deployed_routines: set[bytes] = set() + self._deploying_routines: set[bytes] = set() + def _resolve_session(self, session: Optional[Session]) -> Session: """Resolves the BigFrames session.""" import bigframes.pandas as bpd @@ -191,6 +194,83 @@ def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): with self._artifacts_lock: self._temp_artifacts[bqrf_routine] = gcf_path + def deploy_undeployed_udf( + self, + session: Session, + bq_udf: udf_def.PythonUdf, + ) -> udf_def.BigqueryUdf: + """Deploys a UDF to BigQuery if not already deployed.""" + udf_hash = bq_udf.stable_hash() + import time + + bigquery_client = self._resolve_bigquery_client(session, None) + bq_connection_manager = session.bqconnectionmanager + dataset_ref = self._resolve_dataset_reference(session, bigquery_client, None) + bq_location, _ = _utils.get_remote_function_locations( + bigquery_client.location + ) + + managed_function_client = _function_client.FunctionClient( + dataset_ref.project, + bq_location, + dataset_ref.dataset_id, + bigquery_client, + bq_connection_manager, + session=session, + ) + + config = bq_udf.to_managed_function_config() + bq_function_name = _function_client.get_managed_function_name( + config, session.session_id + ) + full_rf_name = ( + managed_function_client.get_remote_function_fully_qualilfied_name( + bq_function_name + ) + ) + routine_ref = bigquery.RoutineReference.from_string(full_rf_name) + + with self._artifacts_lock: + if udf_hash in self._deployed_routines: + return udf_def.BigqueryUdf( + routine_ref=routine_ref, + signature=bq_udf.signature, + ) + + while True: + with self._artifacts_lock: + if udf_hash in self._deployed_routines: + return udf_def.BigqueryUdf( + routine_ref=routine_ref, + signature=bq_udf.signature, + ) + + if udf_hash not in self._deploying_routines: + self._deploying_routines.add(udf_hash) + break + + time.sleep(0.2) + + try: + managed_function_client.provision_bq_managed_function( + name=bq_function_name, + config=config, + ) + except Exception: + with self._artifacts_lock: + self._deploying_routines.discard(udf_hash) + raise + + with self._artifacts_lock: + self._deploying_routines.discard(udf_hash) + self._deployed_routines.add(udf_hash) + self._temp_artifacts[full_rf_name] = "" + + return udf_def.BigqueryUdf( + routine_ref=routine_ref, + signature=bq_udf.signature, + ) + def clean_up( self, bqclient: bigquery.Client, @@ -679,6 +759,8 @@ def udf( max_batching_rows: Optional[int] = None, container_cpu: Optional[float] = None, container_memory: Optional[str] = None, + *, + _force_deploy: bool = False, ): """Decorator to turn a Python user defined function (udf) into a BigQuery managed function. @@ -835,27 +917,42 @@ def wrapper(func): capture_references=False, ) - bq_function_name = managed_function_client.provision_bq_managed_function( - name=name, - config=config, - ) - full_rf_name = ( - managed_function_client.get_remote_function_fully_qualilfied_name( - bq_function_name - ) + requirements = udf_def.RuntimeRequirements( + container_cpu=container_cpu, + container_memory=container_memory, + bq_connection_id=bq_connection_id, + max_batching_rows=max_batching_rows, + packages=tuple(packages) if packages else (), ) - udf_definition = udf_def.BigqueryUdf( - routine_ref=bigquery.RoutineReference.from_string(full_rf_name), - signature=udf_sig, - ) + if not name and not _force_deploy: # session-owned resource - deferred deployment + udf_definition = udf_def.PythonUdf( + signature=udf_sig, + code=code_def, + requirements=requirements, + ) + else: + bq_function_name = managed_function_client.provision_bq_managed_function( + name=name, + config=config, + ) + full_rf_name = ( + managed_function_client.get_remote_function_fully_qualilfied_name( + bq_function_name + ) + ) + udf_definition = udf_def.BigqueryUdf( + routine_ref=bigquery.RoutineReference.from_string(full_rf_name), + signature=udf_sig, + ) if udf_sig.is_row_processor: msg = bfe.format_message("input_types=Series is in preview.") warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) if not name: # session-owned resource - will be cleaned up automatically - self._update_temp_artifacts(full_rf_name, "") + if _force_deploy: + self._update_temp_artifacts(full_rf_name, "") return bq_functions.UdfRoutine(func=func, _udf_def=udf_definition) # user-managed permanent resource - will not be cleaned up automatically @@ -888,9 +985,7 @@ def deploy_udf( A wrapped Python user defined function, usable in :meth:`~bigframes.series.Series.apply`. """ - # TODO(tswast): If we update udf to defer deployment, update this method - # to deploy immediately. - return self.udf(**kwargs)(func) + return self.udf(_force_deploy=True, **kwargs)(func) def _resolve_signature( diff --git a/packages/bigframes/bigframes/functions/function.py b/packages/bigframes/bigframes/functions/function.py index dc0fa55c8e7b..49b59f4360b7 100644 --- a/packages/bigframes/bigframes/functions/function.py +++ b/packages/bigframes/bigframes/functions/function.py @@ -16,7 +16,7 @@ import dataclasses import logging -from typing import TYPE_CHECKING, Callable, Optional, Protocol, runtime_checkable +from typing import TYPE_CHECKING, Callable, Optional, Protocol, runtime_checkable, Union import google.api_core.exceptions from google.cloud import bigquery @@ -162,7 +162,7 @@ class Udf(Protocol): """ @property - def udf_def(self) -> udf_def.BigqueryUdf: ... + def udf_def(self) -> Union[udf_def.BigqueryUdf, udf_def.PythonUdf]: ... class BigqueryCallableRoutine: @@ -242,11 +242,11 @@ class UdfRoutine: func: Callable # Try not to depend on this, bq managed function creation will be deferred later # And this ref will be replaced with requirements rather to support lazy creation - _udf_def: udf_def.BigqueryUdf + _udf_def: Union[udf_def.BigqueryUdf, udf_def.PythonUdf] def __call__(self, *args, **kwargs): return self.func(*args, **kwargs) @property - def udf_def(self) -> udf_def.BigqueryUdf: + def udf_def(self) -> Union[udf_def.BigqueryUdf, udf_def.PythonUdf]: return self._udf_def diff --git a/packages/bigframes/bigframes/functions/udf_def.py b/packages/bigframes/bigframes/functions/udf_def.py index fbe000f608fd..f85b04edd066 100644 --- a/packages/bigframes/bigframes/functions/udf_def.py +++ b/packages/bigframes/bigframes/functions/udf_def.py @@ -371,12 +371,35 @@ def stable_hash(self) -> bytes: return hash_val.digest() +@dataclasses.dataclass(frozen=True) +class RuntimeRequirements: + container_cpu: Optional[float] = None + container_memory: Optional[str] = None + bq_connection_id: Optional[str] = None + max_batching_rows: Optional[int] = None + packages: tuple[str, ...] = () + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + if self.container_cpu is not None: + hash_val.update(str(self.container_cpu).encode()) + if self.container_memory is not None: + hash_val.update(str(self.container_memory).encode()) + if self.bq_connection_id is not None: + hash_val.update(str(self.bq_connection_id).encode()) + if self.max_batching_rows is not None: + hash_val.update(str(self.max_batching_rows).encode()) + if self.packages: + for p in sorted(self.packages): + hash_val.update(p.encode()) + return hash_val.digest() + + @dataclasses.dataclass(frozen=True) class BigqueryUdf: """ Represents the information needed to call a BigQuery remote function - not a full spec. """ - routine_ref: bigquery.RoutineReference = dataclasses.field() signature: UdfSignature @@ -398,6 +421,37 @@ def from_routine( return cls(routine.reference, signature=signature) +@dataclasses.dataclass(frozen=True) +class PythonUdf: + """ + Represents user-requested Python UDF semantics, including the code and runtime requirements. + """ + signature: UdfSignature + code: CodeDef + requirements: RuntimeRequirements = dataclasses.field( + default_factory=RuntimeRequirements + ) + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self.code.stable_hash()) + hash_val.update(self.signature.stable_hash()) + hash_val.update(self.requirements.stable_hash()) + return hash_val.digest() + + def to_managed_function_config(self) -> ManagedFunctionConfig: + return ManagedFunctionConfig( + code=self.code, + signature=self.signature, + max_batching_rows=self.requirements.max_batching_rows, + container_cpu=self.requirements.container_cpu, + container_memory=self.requirements.container_memory, + bq_connection_id=self.requirements.bq_connection_id, + capture_references=False, + ) + + + @dataclasses.dataclass(frozen=True) class CodeDef: # Produced by cloudpickle, not compatible across python versions diff --git a/packages/bigframes/bigframes/session/__init__.py b/packages/bigframes/bigframes/session/__init__.py index 38e92a60321b..92e032bc31e4 100644 --- a/packages/bigframes/bigframes/session/__init__.py +++ b/packages/bigframes/bigframes/session/__init__.py @@ -1958,7 +1958,7 @@ def udf( output_type: Optional[type] = None, dataset: str, bigquery_connection: Optional[str] = None, - name: str, + name: Optional[str] = None, packages: Optional[Sequence[str]] = None, max_batching_rows: Optional[int] = None, container_cpu: Optional[float] = None, diff --git a/packages/bigframes/bigframes/session/bq_caching_executor.py b/packages/bigframes/bigframes/session/bq_caching_executor.py index 19f26bd9c322..037392f107f3 100644 --- a/packages/bigframes/bigframes/session/bq_caching_executor.py +++ b/packages/bigframes/bigframes/session/bq_caching_executor.py @@ -510,12 +510,87 @@ def _prepare_plan_simplify(self, plan: nodes.BigFrameNode) -> nodes.BigFrameNode plan = plan.top_down(rewrite.fold_row_counts) return plan + async def _deploy_undeployed_udfs(self, plan: nodes.BigFrameNode) -> nodes.BigFrameNode: + import dataclasses + import bigframes.core.expression as expression + import bigframes.functions.udf_def as udf_def + import bigframes.operations as ops + + undeployed_udfs: list[udf_def.PythonUdf] = [] + for node in plan.unique_nodes(): + for expr in node._node_expressions: + for sub_expr in expr.walk(): + if isinstance(sub_expr, expression.OpExpression): + op = sub_expr.op + if isinstance( + op, + ( + ops.RemoteFunctionOp, + ops.BinaryRemoteFunctionOp, + ops.NaryRemoteFunctionOp, + ), + ): + func_def = op.function_def + if isinstance(func_def, udf_def.PythonUdf): + undeployed_udfs.append(func_def) + + if not undeployed_udfs: + return plan + + # Deduplicate while preserving order + seen = set() + unique_undeployed_udfs = [] + for udf in undeployed_udfs: + if udf not in seen: + seen.add(udf) + unique_undeployed_udfs.append(udf) + + session = self.loader._session + deployed_mapping: dict[udf_def.PythonUdf, udf_def.BigqueryUdf] = {} + for udf in unique_undeployed_udfs: + deployed_udf = await asyncio.to_thread( + session._function_session.deploy_undeployed_udf, + session, + udf, + ) + deployed_mapping[udf] = deployed_udf + + # Now rewrite the plan using bottom_up to substitute the UDF definitions! + def replace_in_expr(expr: expression.Expression) -> expression.Expression: + def replace_step(e: expression.Expression) -> expression.Expression: + if isinstance(e, expression.OpExpression): + op = e.op + if isinstance( + op, + ( + ops.RemoteFunctionOp, + ops.BinaryRemoteFunctionOp, + ops.NaryRemoteFunctionOp, + ), + ): + func_def = op.function_def + if func_def in deployed_mapping: + new_func_def = deployed_mapping[func_def] + new_op = dataclasses.replace(op, function_def=new_func_def) + return dataclasses.replace(e, op=new_op) + return e + + return expr.bottom_up(replace_step) + + def replace_in_node(node: nodes.BigFrameNode) -> nodes.BigFrameNode: + if hasattr(node, "transform_exprs"): + return node.transform_exprs(replace_in_expr) + return node + + return plan.bottom_up(replace_in_node) + async def _prepare_plan_bq_execution( self, plan: nodes.BigFrameNode, compute_options: Optional[ex_spec.BqComputeOptions] = None, ) -> nodes.BigFrameNode: """Prepare the plan for BigQuery execution by caching subtrees and uploading large local sources.""" + plan = await self._deploy_undeployed_udfs(plan) if compute_options is not None and compute_options.enable_multi_query_execution: await self._simplify_with_caching(plan, compute_options=compute_options) plan = self._prepare_plan_simplify(plan) diff --git a/packages/bigframes/tests/system/large/functions/test_managed_function.py b/packages/bigframes/tests/system/large/functions/test_managed_function.py index 0c2e3d8fe895..d735cc407b9b 100644 --- a/packages/bigframes/tests/system/large/functions/test_managed_function.py +++ b/packages/bigframes/tests/system/large/functions/test_managed_function.py @@ -1128,3 +1128,66 @@ def foo_list(x: int, y0: float, y1: bytes, y2: bool) -> list[str]: # Ignore any dtype difference. pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_deferred_unnamed_udf_execution(session, dataset_id, scalars_dfs): + import bigframes.functions.udf_def as udf_def + + # Create an unnamed UDF (name=None) + @session.udf(dataset=dataset_id) + def unnamed_multiplier(x: int) -> int: + return x * 3 + + # 1. Assert it is represented as a PythonUdf (not deployed yet) + assert isinstance(unnamed_multiplier.udf_def, udf_def.PythonUdf) + + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df["int64_too"] + pd_series = scalars_pandas_df["int64_too"] + + # 2. Applying it triggers deployment behind the scenes! + bf_result = bf_series.apply(unnamed_multiplier).to_pandas() + pd_result = pd_series.apply(lambda x: x * 3) + + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + # 3. Verify that the deployed routine name matches our stable hash and exists in BigQuery + import bigframes.functions._function_client as bff_client + config = unnamed_multiplier.udf_def.to_managed_function_config() + expected_routine_name = bff_client.get_managed_function_name(config, session.session_id) + routine = session.bqclient.get_routine(f"{session._anonymous_dataset.project}.{session._anonymous_dataset.dataset_id}.{expected_routine_name}") + assert routine is not None + + +def test_deferred_udf_with_runtime_requirements(session, dataset_id, scalars_dfs): + import bigframes.functions.udf_def as udf_def + + # Create an unnamed UDF with custom options + @session.udf( + dataset=dataset_id, + container_cpu=1, + container_memory="2Gi", + max_batching_rows=25, + ) + def heavy_unnamed_udf(x: int) -> int: + return x + 100 + + assert isinstance(heavy_unnamed_udf.udf_def, udf_def.PythonUdf) + + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df["int64_too"] + pd_series = scalars_pandas_df["int64_too"] + + bf_result = bf_series.apply(heavy_unnamed_udf).to_pandas() + pd_result = pd_series.apply(lambda x: x + 100) + + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + # Verify it was deployed with the correct runtime options + import bigframes.functions._function_client as bff_client + config = heavy_unnamed_udf.udf_def.to_managed_function_config() + expected_routine_name = bff_client.get_managed_function_name(config, session.session_id) + routine = session.bqclient.get_routine(f"{session._anonymous_dataset.project}.{session._anonymous_dataset.dataset_id}.{expected_routine_name}") + assert routine._properties["externalRuntimeOptions"]["containerCpu"] == 1 + assert routine._properties["externalRuntimeOptions"]["containerMemory"] == "2Gi" + assert routine._properties["externalRuntimeOptions"]["maxBatchingRows"] == "25" diff --git a/packages/bigframes/tests/unit/functions/test_remote_function.py b/packages/bigframes/tests/unit/functions/test_remote_function.py index 17c04b338385..a6bd0cab3cee 100644 --- a/packages/bigframes/tests/unit/functions/test_remote_function.py +++ b/packages/bigframes/tests/unit/functions/test_remote_function.py @@ -75,3 +75,66 @@ def my_remote_func(x: int) -> int: # Test that the function would have been deployed somewhere. assert "my_custom_name" in deployed.bigframes_bigquery_function + + +def test_deferred_udf_execution(): + import bigframes.functions.udf_def as udf_def + import google.cloud.bigquery + + session = mocks.create_bigquery_session() + @session._function_session.udf(session=session) + def my_unnamed_udf(x: int) -> int: + return x * 2 + + # 1. Verify that no BQ query was executed to deploy the UDF during registration! + session._queries.clear() + assert len(session._queries) == 0 + + # 2. Verify that it created a PythonUdf + assert isinstance(my_unnamed_udf.udf_def, udf_def.PythonUdf) + + # 3. Verify that when calling the UDF via a query, it triggers the UDF deployment query! + import bigframes.core.nodes as nodes + import bigframes.core.expression as ex + import bigframes.operations as ops + + # Let's construct an expression using our UDF + udf_op = ops.RemoteFunctionOp(function_def=my_unnamed_udf.udf_def, apply_on_null=False) + expr = ex.OpExpression(op=udf_op, inputs=(ex.const(5),)) + + class MockNode: + def __init__(self, exprs): + self._node_expressions = exprs + self.child_nodes = [] + + def unique_nodes(self): + yield self + + def bottom_up(self, transform): + return transform(self) + + def transform_exprs(self, fn): + return MockNode([fn(e) for e in self._node_expressions]) + + mock_node = MockNode([expr]) + + import asyncio + # Deploy and replace definition in the plan + new_plan = asyncio.run(session._executor._ibis_executor._deploy_undeployed_udfs(mock_node)) + + # Verify that the DDL to create the function was executed! + assert len(session._queries) > 0 + assert any("CREATE OR REPLACE FUNCTION" in q for q in session._queries) + + # 4. Verify that the definition in the plan has been replaced with BigqueryUdf + new_expr = new_plan._node_expressions[0] + new_op = new_expr.op + assert isinstance(new_op.function_def, udf_def.BigqueryUdf) + assert new_op.function_def.routine_ref is not None + + # 5. Verify memoization: Deploying the new plan again executes ZERO additional DDL queries! + session._queries.clear() + new_plan_2 = asyncio.run(session._executor._ibis_executor._deploy_undeployed_udfs(new_plan)) + assert len(session._queries) == 0 + assert new_plan_2 == new_plan + From 09552a8b2a84601ba331587f150b44a305ebf639 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 20 May 2026 23:10:30 +0000 Subject: [PATCH 02/21] fixes --- .../bigframes/functions/_function_session.py | 16 ++++++++------- .../bigframes/bigframes/functions/function.py | 2 +- .../bigframes/bigframes/functions/udf_def.py | 3 ++- .../bigframes/bigframes/pandas/__init__.py | 2 +- .../bigframes/session/bq_caching_executor.py | 5 ++++- .../large/functions/test_managed_function.py | 18 +++++++++++++---- .../unit/functions/test_remote_function.py | 20 +++++++++++++------ 7 files changed, 45 insertions(+), 21 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 27b0a250db65..b02fb665c3f7 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -206,9 +206,7 @@ def deploy_undeployed_udf( bigquery_client = self._resolve_bigquery_client(session, None) bq_connection_manager = session.bqconnectionmanager dataset_ref = self._resolve_dataset_reference(session, bigquery_client, None) - bq_location, _ = _utils.get_remote_function_locations( - bigquery_client.location - ) + bq_location, _ = _utils.get_remote_function_locations(bigquery_client.location) managed_function_client = _function_client.FunctionClient( dataset_ref.project, @@ -925,16 +923,20 @@ def wrapper(func): packages=tuple(packages) if packages else (), ) - if not name and not _force_deploy: # session-owned resource - deferred deployment + if ( + not name and not _force_deploy + ): # session-owned resource - deferred deployment udf_definition = udf_def.PythonUdf( signature=udf_sig, code=code_def, requirements=requirements, ) else: - bq_function_name = managed_function_client.provision_bq_managed_function( - name=name, - config=config, + bq_function_name = ( + managed_function_client.provision_bq_managed_function( + name=name, + config=config, + ) ) full_rf_name = ( managed_function_client.get_remote_function_fully_qualilfied_name( diff --git a/packages/bigframes/bigframes/functions/function.py b/packages/bigframes/bigframes/functions/function.py index 49b59f4360b7..e9a40f415324 100644 --- a/packages/bigframes/bigframes/functions/function.py +++ b/packages/bigframes/bigframes/functions/function.py @@ -16,7 +16,7 @@ import dataclasses import logging -from typing import TYPE_CHECKING, Callable, Optional, Protocol, runtime_checkable, Union +from typing import TYPE_CHECKING, Callable, Optional, Protocol, Union, runtime_checkable import google.api_core.exceptions from google.cloud import bigquery diff --git a/packages/bigframes/bigframes/functions/udf_def.py b/packages/bigframes/bigframes/functions/udf_def.py index f85b04edd066..b95dafc4253b 100644 --- a/packages/bigframes/bigframes/functions/udf_def.py +++ b/packages/bigframes/bigframes/functions/udf_def.py @@ -400,6 +400,7 @@ class BigqueryUdf: """ Represents the information needed to call a BigQuery remote function - not a full spec. """ + routine_ref: bigquery.RoutineReference = dataclasses.field() signature: UdfSignature @@ -426,6 +427,7 @@ class PythonUdf: """ Represents user-requested Python UDF semantics, including the code and runtime requirements. """ + signature: UdfSignature code: CodeDef requirements: RuntimeRequirements = dataclasses.field( @@ -451,7 +453,6 @@ def to_managed_function_config(self) -> ManagedFunctionConfig: ) - @dataclasses.dataclass(frozen=True) class CodeDef: # Produced by cloudpickle, not compatible across python versions diff --git a/packages/bigframes/bigframes/pandas/__init__.py b/packages/bigframes/bigframes/pandas/__init__.py index 34ec3037e92f..082a00438f42 100644 --- a/packages/bigframes/bigframes/pandas/__init__.py +++ b/packages/bigframes/bigframes/pandas/__init__.py @@ -202,7 +202,7 @@ def udf( output_type: Optional[type] = None, dataset: str, bigquery_connection: Optional[str] = None, - name: str, + name: Optional[str] = None, packages: Optional[Sequence[str]] = None, max_batching_rows: Optional[int] = None, container_cpu: Optional[float] = None, diff --git a/packages/bigframes/bigframes/session/bq_caching_executor.py b/packages/bigframes/bigframes/session/bq_caching_executor.py index 037392f107f3..4543b24e2512 100644 --- a/packages/bigframes/bigframes/session/bq_caching_executor.py +++ b/packages/bigframes/bigframes/session/bq_caching_executor.py @@ -510,8 +510,11 @@ def _prepare_plan_simplify(self, plan: nodes.BigFrameNode) -> nodes.BigFrameNode plan = plan.top_down(rewrite.fold_row_counts) return plan - async def _deploy_undeployed_udfs(self, plan: nodes.BigFrameNode) -> nodes.BigFrameNode: + async def _deploy_undeployed_udfs( + self, plan: nodes.BigFrameNode + ) -> nodes.BigFrameNode: import dataclasses + import bigframes.core.expression as expression import bigframes.functions.udf_def as udf_def import bigframes.operations as ops diff --git a/packages/bigframes/tests/system/large/functions/test_managed_function.py b/packages/bigframes/tests/system/large/functions/test_managed_function.py index d735cc407b9b..e93d2bb068be 100644 --- a/packages/bigframes/tests/system/large/functions/test_managed_function.py +++ b/packages/bigframes/tests/system/large/functions/test_managed_function.py @@ -1153,9 +1153,14 @@ def unnamed_multiplier(x: int) -> int: # 3. Verify that the deployed routine name matches our stable hash and exists in BigQuery import bigframes.functions._function_client as bff_client + config = unnamed_multiplier.udf_def.to_managed_function_config() - expected_routine_name = bff_client.get_managed_function_name(config, session.session_id) - routine = session.bqclient.get_routine(f"{session._anonymous_dataset.project}.{session._anonymous_dataset.dataset_id}.{expected_routine_name}") + expected_routine_name = bff_client.get_managed_function_name( + config, session.session_id + ) + routine = session.bqclient.get_routine( + f"{session._anonymous_dataset.project}.{session._anonymous_dataset.dataset_id}.{expected_routine_name}" + ) assert routine is not None @@ -1185,9 +1190,14 @@ def heavy_unnamed_udf(x: int) -> int: # Verify it was deployed with the correct runtime options import bigframes.functions._function_client as bff_client + config = heavy_unnamed_udf.udf_def.to_managed_function_config() - expected_routine_name = bff_client.get_managed_function_name(config, session.session_id) - routine = session.bqclient.get_routine(f"{session._anonymous_dataset.project}.{session._anonymous_dataset.dataset_id}.{expected_routine_name}") + expected_routine_name = bff_client.get_managed_function_name( + config, session.session_id + ) + routine = session.bqclient.get_routine( + f"{session._anonymous_dataset.project}.{session._anonymous_dataset.dataset_id}.{expected_routine_name}" + ) assert routine._properties["externalRuntimeOptions"]["containerCpu"] == 1 assert routine._properties["externalRuntimeOptions"]["containerMemory"] == "2Gi" assert routine._properties["externalRuntimeOptions"]["maxBatchingRows"] == "25" diff --git a/packages/bigframes/tests/unit/functions/test_remote_function.py b/packages/bigframes/tests/unit/functions/test_remote_function.py index a6bd0cab3cee..4490bf36caae 100644 --- a/packages/bigframes/tests/unit/functions/test_remote_function.py +++ b/packages/bigframes/tests/unit/functions/test_remote_function.py @@ -78,10 +78,12 @@ def my_remote_func(x: int) -> int: def test_deferred_udf_execution(): - import bigframes.functions.udf_def as udf_def import google.cloud.bigquery + import bigframes.functions.udf_def as udf_def + session = mocks.create_bigquery_session() + @session._function_session.udf(session=session) def my_unnamed_udf(x: int) -> int: return x * 2 @@ -94,12 +96,14 @@ def my_unnamed_udf(x: int) -> int: assert isinstance(my_unnamed_udf.udf_def, udf_def.PythonUdf) # 3. Verify that when calling the UDF via a query, it triggers the UDF deployment query! - import bigframes.core.nodes as nodes import bigframes.core.expression as ex + import bigframes.core.nodes as nodes import bigframes.operations as ops # Let's construct an expression using our UDF - udf_op = ops.RemoteFunctionOp(function_def=my_unnamed_udf.udf_def, apply_on_null=False) + udf_op = ops.RemoteFunctionOp( + function_def=my_unnamed_udf.udf_def, apply_on_null=False + ) expr = ex.OpExpression(op=udf_op, inputs=(ex.const(5),)) class MockNode: @@ -119,8 +123,11 @@ def transform_exprs(self, fn): mock_node = MockNode([expr]) import asyncio + # Deploy and replace definition in the plan - new_plan = asyncio.run(session._executor._ibis_executor._deploy_undeployed_udfs(mock_node)) + new_plan = asyncio.run( + session._executor._ibis_executor._deploy_undeployed_udfs(mock_node) + ) # Verify that the DDL to create the function was executed! assert len(session._queries) > 0 @@ -134,7 +141,8 @@ def transform_exprs(self, fn): # 5. Verify memoization: Deploying the new plan again executes ZERO additional DDL queries! session._queries.clear() - new_plan_2 = asyncio.run(session._executor._ibis_executor._deploy_undeployed_udfs(new_plan)) + new_plan_2 = asyncio.run( + session._executor._ibis_executor._deploy_undeployed_udfs(new_plan) + ) assert len(session._queries) == 0 assert new_plan_2 == new_plan - From 5c660d5dcc5aedf89063abdeb2b840a0d6a16ad9 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 21 May 2026 20:53:55 +0000 Subject: [PATCH 03/21] simplify remote function op defs --- .../ibis_compiler/scalar_op_registry.py | 36 ---------- .../sqlglot/expressions/generic_ops.py | 23 ------ .../bigframes/bigframes/core/rewrite/udfs.py | 43 +---------- packages/bigframes/bigframes/dataframe.py | 26 +++---- .../bigframes/functions/_function_session.py | 4 +- .../bigframes/operations/__init__.py | 9 +-- .../operations/remote_function_ops.py | 27 ++----- .../bigframes/bigframes/operations/to_op.py | 39 ++++++++++ packages/bigframes/bigframes/series.py | 19 ++--- .../bigframes/session/bq_caching_executor.py | 63 ++++++++-------- .../sqlglot/expressions/test_generic_ops.py | 57 +-------------- .../unit/functions/test_remote_function.py | 71 ------------------- 12 files changed, 97 insertions(+), 320 deletions(-) create mode 100644 packages/bigframes/bigframes/operations/to_op.py diff --git a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index e9e8435ea1a1..b5b1e515cefa 100644 --- a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1039,42 +1039,6 @@ def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): udf_sig = op.function_def.signature assert not udf_sig.is_virtual # should have been devirtualized in lowering pass ibis_py_sig = (tuple(arg.py_type for arg in udf_sig.inputs), udf_sig.output.py_type) - - @ibis_udf.scalar.builtin( - name=str(op.function_def.routine_ref), signature=ibis_py_sig - ) - def udf(input): ... - - x_transformed = udf(x) - if not op.apply_on_null: - return ibis_api.case().when(x.isnull(), x).else_(x_transformed).end() - return x_transformed - - -@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True) -def binary_remote_function_op_impl( - x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp -): - udf_sig = op.function_def.signature - assert not udf_sig.is_virtual # should have been devirtualized in lowering pass - ibis_py_sig = (tuple(arg.py_type for arg in udf_sig.inputs), udf_sig.output.py_type) - - @ibis_udf.scalar.builtin( - name=str(op.function_def.routine_ref), signature=ibis_py_sig - ) - def udf(input1, input2): ... - - x_transformed = udf(x, y) - return x_transformed - - -@scalar_op_compiler.register_nary_op(ops.NaryRemoteFunctionOp, pass_op=True) -def nary_remote_function_op_impl( - *operands: ibis_types.Value, op: ops.NaryRemoteFunctionOp -): - udf_sig = op.function_def.signature - assert not udf_sig.is_virtual # should have been devirtualized in lowering pass - ibis_py_sig = (tuple(arg.py_type for arg in udf_sig.inputs), udf_sig.output.py_type) arg_names = tuple(arg.name for arg in udf_sig.inputs) @ibis_udf.scalar.builtin( diff --git a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py index a6cfb52685a0..292bad714098 100644 --- a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py +++ b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py @@ -188,29 +188,6 @@ def _get_remote_function_name(op): @register_unary_op(ops.RemoteFunctionOp, pass_op=True) def _(expr: TypedExpr, op: ops.RemoteFunctionOp) -> sge.Expression: - func_name = _get_remote_function_name(op) - func = sge.func(func_name, expr.expr) - - if not op.apply_on_null: - return sge.If( - this=sge.Is(this=expr.expr, expression=sge.Null()), - true=expr.expr, - false=func, - ) - - return func - - -@register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True) -def _( - left: TypedExpr, right: TypedExpr, op: ops.BinaryRemoteFunctionOp -) -> sge.Expression: - func_name = _get_remote_function_name(op) - return sge.func(func_name, left.expr, right.expr) - - -@register_nary_op(ops.NaryRemoteFunctionOp, pass_op=True) -def _(*operands: TypedExpr, op: ops.NaryRemoteFunctionOp) -> sge.Expression: func_name = _get_remote_function_name(op) return sge.func(func_name, *(operand.expr for operand in operands)) diff --git a/packages/bigframes/bigframes/core/rewrite/udfs.py b/packages/bigframes/bigframes/core/rewrite/udfs.py index 284ac4217c09..286a9d9d9401 100644 --- a/packages/bigframes/bigframes/core/rewrite/udfs.py +++ b/packages/bigframes/bigframes/core/rewrite/udfs.py @@ -32,7 +32,6 @@ def lower(self, expr: expression.OpExpression) -> expression.Expression: func_def = expr.op.function_def devirtualized_expr = ops.RemoteFunctionOp( func_def.with_devirtualize(), - apply_on_null=expr.op.apply_on_null, ).as_expr(*expr.children) if isinstance(func_def.signature.output, udf_def.VirtualListTypeV1): return func_def.signature.output.out_expr(devirtualized_expr) @@ -40,47 +39,7 @@ def lower(self, expr: expression.OpExpression) -> expression.Expression: return devirtualized_expr -@dataclasses.dataclass -class LowerBinaryRemoteFunctionRule(op_lowering.OpLoweringRule): - @property - def op(self) -> type[ops.ScalarOp]: - return ops.BinaryRemoteFunctionOp - - def lower(self, expr: expression.OpExpression) -> expression.Expression: - assert isinstance(expr.op, ops.BinaryRemoteFunctionOp) - func_def = expr.op.function_def - devirtualized_expr = ops.BinaryRemoteFunctionOp( - func_def.with_devirtualize(), - ).as_expr(*expr.children) - if isinstance(func_def.signature.output, udf_def.VirtualListTypeV1): - return func_def.signature.output.out_expr(devirtualized_expr) - else: - return devirtualized_expr - - -@dataclasses.dataclass -class LowerNaryRemoteFunctionRule(op_lowering.OpLoweringRule): - @property - def op(self) -> type[ops.ScalarOp]: - return ops.NaryRemoteFunctionOp - - def lower(self, expr: expression.OpExpression) -> expression.Expression: - assert isinstance(expr.op, ops.NaryRemoteFunctionOp) - func_def = expr.op.function_def - devirtualized_expr = ops.NaryRemoteFunctionOp( - func_def.with_devirtualize(), - ).as_expr(*expr.children) - if isinstance(func_def.signature.output, udf_def.VirtualListTypeV1): - return func_def.signature.output.out_expr(devirtualized_expr) - else: - return devirtualized_expr - - -UDF_LOWERING_RULES = ( - LowerRemoteFunctionRule(), - LowerBinaryRemoteFunctionRule(), - LowerNaryRemoteFunctionRule(), -) +UDF_LOWERING_RULES = (LowerRemoteFunctionRule(),) def lower_udfs(root: bigframe_node.BigFrameNode) -> bigframe_node.BigFrameNode: diff --git a/packages/bigframes/bigframes/dataframe.py b/packages/bigframes/bigframes/dataframe.py index d7755517293e..944c70fbf828 100644 --- a/packages/bigframes/bigframes/dataframe.py +++ b/packages/bigframes/bigframes/dataframe.py @@ -4694,12 +4694,10 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: if na_action not in {None, "ignore"}: raise ValueError(f"na_action={na_action} not supported") + op = ops.func_to_op(func) + # TODO(shobs): Support **kwargs - return self._apply_unary_op( - ops.RemoteFunctionOp( - function_def=func.udf_def, apply_on_null=(na_action is None) - ) - ) + return self._apply_nary_op(op) def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # In Bigframes BigQuery function, DataFrame '.apply' method is specifically @@ -4770,17 +4768,11 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) # Apply the function - if args: - result_series = rows_as_json_series._apply_nary_op( - ops.NaryRemoteFunctionOp(function_def=func.udf_def), - list(args), - ) - else: - result_series = rows_as_json_series._apply_unary_op( - ops.RemoteFunctionOp( - function_def=func.udf_def, apply_on_null=True - ) - ) + result_series = rows_as_json_series._apply_nary_op( + ops.func_to_op(func), + list(args), + ) + else: # This is a special case where we are providing not-pandas-like # extension. If the bigquery function can take one or more @@ -4838,7 +4830,7 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): series_list = [self[col] for col in self.columns] op_list = series_list[1:] + list(args) result_series = series_list[0]._apply_nary_op( - ops.NaryRemoteFunctionOp(function_def=func.udf_def), op_list + ops.func_to_op(func), op_list ) result_series.name = None diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index b02fb665c3f7..89b52e42a319 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -20,6 +20,7 @@ import inspect import sys import threading +import time import warnings from typing import ( TYPE_CHECKING, @@ -194,14 +195,13 @@ def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): with self._artifacts_lock: self._temp_artifacts[bqrf_routine] = gcf_path - def deploy_undeployed_udf( + def _deploy_udf( self, session: Session, bq_udf: udf_def.PythonUdf, ) -> udf_def.BigqueryUdf: """Deploys a UDF to BigQuery if not already deployed.""" udf_hash = bq_udf.stable_hash() - import time bigquery_client = self._resolve_bigquery_client(session, None) bq_connection_manager = session.bqconnectionmanager diff --git a/packages/bigframes/bigframes/operations/__init__.py b/packages/bigframes/bigframes/operations/__init__.py index 473204d0c672..09d64a15c6bf 100644 --- a/packages/bigframes/bigframes/operations/__init__.py +++ b/packages/bigframes/bigframes/operations/__init__.py @@ -183,9 +183,9 @@ ) from bigframes.operations.numpy_op_maps import NUMPY_TO_BINOP, NUMPY_TO_OP from bigframes.operations.remote_function_ops import ( - BinaryRemoteFunctionOp, - NaryRemoteFunctionOp, + PythonUdfOp, RemoteFunctionOp, + func_to_op, ) from bigframes.operations.string_ops import ( EndsWithOp, @@ -375,9 +375,8 @@ "StructFieldOp", "StructOp", # Remote Functions ops - "BinaryRemoteFunctionOp", - "NaryRemoteFunctionOp", "RemoteFunctionOp", + "PythonUdfOp", # Frequency ops "DatetimeToIntegerLabelOp", "FloorDtOp", @@ -437,6 +436,8 @@ "AIIf", "AIScore", "AISimilarity", + # Helper functions + "func_to_op", # Numpy ops mapping "NUMPY_TO_BINOP", "NUMPY_TO_OP", diff --git a/packages/bigframes/bigframes/operations/remote_function_ops.py b/packages/bigframes/bigframes/operations/remote_function_ops.py index 9c51210df0e7..a19cd43007a0 100644 --- a/packages/bigframes/bigframes/operations/remote_function_ops.py +++ b/packages/bigframes/bigframes/operations/remote_function_ops.py @@ -19,12 +19,10 @@ from bigframes.operations import base_ops -# TODO: Enforce input type constraints from function def -@dataclasses.dataclass(frozen=True) -class RemoteFunctionOp(base_ops.UnaryOp): - name: typing.ClassVar[str] = "remote_function" - function_def: udf_def.BigqueryUdf - apply_on_null: bool +@dataclasses.dataclass +class PythonUdfOp(base_ops.NaryOp): + name: typing.ClassVar[str] = "python_udf" + function_def: udf_def.PythonUdf @property def expensive(self) -> bool: @@ -35,21 +33,8 @@ def output_type(self, *input_types): @dataclasses.dataclass(frozen=True) -class BinaryRemoteFunctionOp(base_ops.BinaryOp): - name: typing.ClassVar[str] = "binary_remote_function" - function_def: udf_def.BigqueryUdf - - @property - def expensive(self) -> bool: - return True - - def output_type(self, *input_types): - return self.function_def.signature.output.bf_type - - -@dataclasses.dataclass(frozen=True) -class NaryRemoteFunctionOp(base_ops.NaryOp): - name: typing.ClassVar[str] = "nary_remote_function" +class RemoteFunctionOp(base_ops.NaryOp): + name: typing.ClassVar[str] = "remote_function" function_def: udf_def.BigqueryUdf @property diff --git a/packages/bigframes/bigframes/operations/to_op.py b/packages/bigframes/bigframes/operations/to_op.py new file mode 100644 index 000000000000..a177ea7b423e --- /dev/null +++ b/packages/bigframes/bigframes/operations/to_op.py @@ -0,0 +1,39 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.functions.udf_def import BigqueryUdf, PythonUdf +from bigframes.operations import base_ops, remote_function_ops + + +def func_to_op(op) -> base_ops.NaryOp: + """ + Convert various bigframes, python objects into a bigframes operations. + + This should handle anything that might be passed to eg map, combine, other pandas methods that take a function. + + It should raise a TypeError if the object is not a supported type. + + Args: + op: The object to convert. + + Returns: + A bigframes operations. + """ + # TODO: Handle numpy ufuncs, builtin functions, etc. + if isinstance(op, BigqueryUdf): + return remote_function_ops.RemoteFunctionOp(function_def=op.udf_def) + elif isinstance(op, PythonUdf): + return remote_function_ops.PythonUdfOp(function_def=op.udf_def) + else: + raise TypeError(f"Unsupported function type: {op}") diff --git a/packages/bigframes/bigframes/series.py b/packages/bigframes/bigframes/series.py index a28c2f14cc9d..08bf3dc9bfa4 100644 --- a/packages/bigframes/bigframes/series.py +++ b/packages/bigframes/bigframes/series.py @@ -2043,17 +2043,10 @@ def apply( if isinstance(func, bigframes.functions.Udf): # We are working with bigquery function at this point - if args: - result_series = self._apply_nary_op( - ops.NaryRemoteFunctionOp(function_def=func.udf_def), args - ) - # TODO(jialuo): Investigate why `_apply_nary_op` drops the series - # `name`. Manually reassigning it here as a temporary fix. - result_series.name = self.name - else: - result_series = self._apply_unary_op( - ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True) - ) + result_series = self._apply_nary_op(ops.func_to_op(func), args) + # TODO(jialuo): Investigate why `_apply_nary_op` drops the series + # `name`. Manually reassigning it here as a temporary fix. + result_series.name = self.name return result_series @@ -2103,9 +2096,7 @@ def combine( ) if isinstance(func, bigframes.functions.Udf): - result_series = self._apply_binary_op( - other, ops.BinaryRemoteFunctionOp(function_def=func.udf_def) - ) + result_series = self._apply_binary_op(other, ops.func_to_op(func)) return result_series bf_op = python_ops.python_callable_to_op(func) diff --git a/packages/bigframes/bigframes/session/bq_caching_executor.py b/packages/bigframes/bigframes/session/bq_caching_executor.py index 4543b24e2512..525fccaa1681 100644 --- a/packages/bigframes/bigframes/session/bq_caching_executor.py +++ b/packages/bigframes/bigframes/session/bq_caching_executor.py @@ -513,34 +513,10 @@ def _prepare_plan_simplify(self, plan: nodes.BigFrameNode) -> nodes.BigFrameNode async def _deploy_undeployed_udfs( self, plan: nodes.BigFrameNode ) -> nodes.BigFrameNode: - import dataclasses - - import bigframes.core.expression as expression - import bigframes.functions.udf_def as udf_def - import bigframes.operations as ops - - undeployed_udfs: list[udf_def.PythonUdf] = [] - for node in plan.unique_nodes(): - for expr in node._node_expressions: - for sub_expr in expr.walk(): - if isinstance(sub_expr, expression.OpExpression): - op = sub_expr.op - if isinstance( - op, - ( - ops.RemoteFunctionOp, - ops.BinaryRemoteFunctionOp, - ops.NaryRemoteFunctionOp, - ), - ): - func_def = op.function_def - if isinstance(func_def, udf_def.PythonUdf): - undeployed_udfs.append(func_def) - + undeployed_udfs = self._collect_udf_defs(plan) if not undeployed_udfs: return plan - # Deduplicate while preserving order seen = set() unique_undeployed_udfs = [] for udf in undeployed_udfs: @@ -552,30 +528,49 @@ async def _deploy_undeployed_udfs( deployed_mapping: dict[udf_def.PythonUdf, udf_def.BigqueryUdf] = {} for udf in unique_undeployed_udfs: deployed_udf = await asyncio.to_thread( - session._function_session.deploy_undeployed_udf, + session._function_session._deploy_udf, session, udf, ) deployed_mapping[udf] = deployed_udf + return self._subsitute_temporary_functions(plan, deployed_mapping) + + def _collect_udf_defs(self, plan: nodes.BigFrameNode) -> list[udf_def.PythonUdf]: + udf_defs: list[udf_def.PythonUdf] = [] + for node in plan.unique_nodes(): + for expr in node._node_expressions: + for sub_expr in expr.walk(): + if isinstance(sub_expr, expression.OpExpression): + op = sub_expr.op + if isinstance( + op, + (ops.PythonUdfOp,), + ): + func_def = op.function_def + if isinstance(func_def, udf_def.PythonUdf): + udf_defs.append(func_def) + return udf_defs + + def _subsitute_temporary_functions( + self, + plan: nodes.BigFrameNode, + deployed_mapping: dict[udf_def.PythonUdf, udf_def.BigqueryUdf], + ) -> nodes.BigFrameNode: # Now rewrite the plan using bottom_up to substitute the UDF definitions! def replace_in_expr(expr: expression.Expression) -> expression.Expression: def replace_step(e: expression.Expression) -> expression.Expression: if isinstance(e, expression.OpExpression): op = e.op - if isinstance( - op, - ( - ops.RemoteFunctionOp, - ops.BinaryRemoteFunctionOp, - ops.NaryRemoteFunctionOp, - ), - ): + if isinstance(op, ops.PythonUdfOp): func_def = op.function_def if func_def in deployed_mapping: new_func_def = deployed_mapping[func_def] new_op = dataclasses.replace(op, function_def=new_func_def) return dataclasses.replace(e, op=new_op) + raise ValueError( + f"UDF definition {func_def} not found in deployed mapping" + ) return e return expr.bottom_up(replace_step) diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py index a986ca270de0..fb5a9fd7ce84 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py @@ -172,63 +172,8 @@ def test_astype_json_invalid( def test_remote_function_op(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col"]] - function_def = udf_def.BigqueryUdf( - routine_ref=bigquery.RoutineReference.from_string( - "my_project.my_dataset.my_routine" - ), - signature=udf_def.UdfSignature( - inputs=( - udf_def.UdfArg( - "x", - udf_def.DirectScalarType(int), - ), - ), - output=udf_def.DirectScalarType(float), - ), - ) - ops_map = { - "apply_on_null_true": ops.RemoteFunctionOp( - function_def=function_def, apply_on_null=True - ).as_expr("int64_col"), - "apply_on_null_false": ops.RemoteFunctionOp( - function_def=function_def, apply_on_null=False - ).as_expr("int64_col"), - } - sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) - snapshot.assert_match(sql, "out.sql") - - -def test_binary_remote_function_op(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "float64_col"]] - op = ops.BinaryRemoteFunctionOp( - function_def=udf_def.BigqueryUdf( - routine_ref=bigquery.RoutineReference.from_string( - "my_project.my_dataset.my_routine" - ), - signature=udf_def.UdfSignature( - inputs=( - udf_def.UdfArg( - "x", - udf_def.DirectScalarType(int), - ), - udf_def.UdfArg( - "y", - udf_def.DirectScalarType(float), - ), - ), - output=udf_def.DirectScalarType(float), - ), - ) - ) - sql = utils._apply_binary_op(bf_df, op, "int64_col", "float64_col") - - snapshot.assert_match(sql, "out.sql") - - -def test_nary_remote_function_op(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "float64_col", "string_col"]] - op = ops.NaryRemoteFunctionOp( + op = ops.RemoteFunctionOp( function_def=udf_def.BigqueryUdf( routine_ref=bigquery.RoutineReference.from_string( "my_project.my_dataset.my_routine" diff --git a/packages/bigframes/tests/unit/functions/test_remote_function.py b/packages/bigframes/tests/unit/functions/test_remote_function.py index 4490bf36caae..17c04b338385 100644 --- a/packages/bigframes/tests/unit/functions/test_remote_function.py +++ b/packages/bigframes/tests/unit/functions/test_remote_function.py @@ -75,74 +75,3 @@ def my_remote_func(x: int) -> int: # Test that the function would have been deployed somewhere. assert "my_custom_name" in deployed.bigframes_bigquery_function - - -def test_deferred_udf_execution(): - import google.cloud.bigquery - - import bigframes.functions.udf_def as udf_def - - session = mocks.create_bigquery_session() - - @session._function_session.udf(session=session) - def my_unnamed_udf(x: int) -> int: - return x * 2 - - # 1. Verify that no BQ query was executed to deploy the UDF during registration! - session._queries.clear() - assert len(session._queries) == 0 - - # 2. Verify that it created a PythonUdf - assert isinstance(my_unnamed_udf.udf_def, udf_def.PythonUdf) - - # 3. Verify that when calling the UDF via a query, it triggers the UDF deployment query! - import bigframes.core.expression as ex - import bigframes.core.nodes as nodes - import bigframes.operations as ops - - # Let's construct an expression using our UDF - udf_op = ops.RemoteFunctionOp( - function_def=my_unnamed_udf.udf_def, apply_on_null=False - ) - expr = ex.OpExpression(op=udf_op, inputs=(ex.const(5),)) - - class MockNode: - def __init__(self, exprs): - self._node_expressions = exprs - self.child_nodes = [] - - def unique_nodes(self): - yield self - - def bottom_up(self, transform): - return transform(self) - - def transform_exprs(self, fn): - return MockNode([fn(e) for e in self._node_expressions]) - - mock_node = MockNode([expr]) - - import asyncio - - # Deploy and replace definition in the plan - new_plan = asyncio.run( - session._executor._ibis_executor._deploy_undeployed_udfs(mock_node) - ) - - # Verify that the DDL to create the function was executed! - assert len(session._queries) > 0 - assert any("CREATE OR REPLACE FUNCTION" in q for q in session._queries) - - # 4. Verify that the definition in the plan has been replaced with BigqueryUdf - new_expr = new_plan._node_expressions[0] - new_op = new_expr.op - assert isinstance(new_op.function_def, udf_def.BigqueryUdf) - assert new_op.function_def.routine_ref is not None - - # 5. Verify memoization: Deploying the new plan again executes ZERO additional DDL queries! - session._queries.clear() - new_plan_2 = asyncio.run( - session._executor._ibis_executor._deploy_undeployed_udfs(new_plan) - ) - assert len(session._queries) == 0 - assert new_plan_2 == new_plan From 24a4865edab55fabb7ea40a4844c4aa8bfed198d Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 27 May 2026 21:17:49 +0000 Subject: [PATCH 04/21] fixes --- packages/bigframes/bigframes/core/blocks.py | 4 ++-- .../core/compile/ibis_compiler/scalar_op_registry.py | 7 +++---- .../core/compile/sqlglot/expressions/generic_ops.py | 7 +++---- packages/bigframes/bigframes/dataframe.py | 2 +- packages/bigframes/bigframes/operations/__init__.py | 2 +- .../bigframes/operations/remote_function_ops.py | 2 +- packages/bigframes/bigframes/operations/to_op.py | 10 ++++++---- .../bigframes/session/bq_caching_executor.py | 11 ++++++++++- 8 files changed, 27 insertions(+), 18 deletions(-) diff --git a/packages/bigframes/bigframes/core/blocks.py b/packages/bigframes/bigframes/core/blocks.py index b9a246fc0360..6506b56a8bf8 100644 --- a/packages/bigframes/bigframes/core/blocks.py +++ b/packages/bigframes/bigframes/core/blocks.py @@ -1091,9 +1091,9 @@ def multi_apply_window_op( def multi_apply_unary_op( self, - op: Union[ops.UnaryOp, ex.Expression], + op: Union[ops.UnaryOp, ops.NaryOp, ex.Expression], ) -> Block: - if isinstance(op, ops.UnaryOp): + if isinstance(op, (ops.UnaryOp, ops.NaryOp)): input_varname = guid.generate_guid() expr = op.as_expr(ex.free_var(input_varname)) else: diff --git a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index b5b1e515cefa..5172d1e7c602 100644 --- a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1034,8 +1034,8 @@ def timedelta_floor_op_impl(x: ibis_types.NumericValue): return ibis_api.case().when(x > ibis.literal(0), x.floor()).else_(x.ceil()).end() -@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) -def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): +@scalar_op_compiler.register_nary_op(ops.RemoteFunctionOp, pass_op=True) +def remote_function_op_impl(*values: ibis_types.Value, op: ops.RemoteFunctionOp): udf_sig = op.function_def.signature assert not udf_sig.is_virtual # should have been devirtualized in lowering pass ibis_py_sig = (tuple(arg.py_type for arg in udf_sig.inputs), udf_sig.output.py_type) @@ -1048,8 +1048,7 @@ def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): ) def udf(*inputs): ... - result = udf(*operands) - return result + return udf(*values) @scalar_op_compiler.register_unary_op(ops.MapOp, pass_op=True) diff --git a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py index 292bad714098..22dcd8bf51ac 100644 --- a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py +++ b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py @@ -186,10 +186,9 @@ def _get_remote_function_name(op): ) -@register_unary_op(ops.RemoteFunctionOp, pass_op=True) -def _(expr: TypedExpr, op: ops.RemoteFunctionOp) -> sge.Expression: - func_name = _get_remote_function_name(op) - return sge.func(func_name, *(operand.expr for operand in operands)) +@register_nary_op(ops.RemoteFunctionOp, pass_op=True) +def _(*values: TypedExpr, op: ops.RemoteFunctionOp) -> sge.Expression: + return sge.func(_get_remote_function_name(op), *(value.expr for value in values)) @register_nary_op(ops.case_when_op) diff --git a/packages/bigframes/bigframes/dataframe.py b/packages/bigframes/bigframes/dataframe.py index 944c70fbf828..ff3142cb1510 100644 --- a/packages/bigframes/bigframes/dataframe.py +++ b/packages/bigframes/bigframes/dataframe.py @@ -4697,7 +4697,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: op = ops.func_to_op(func) # TODO(shobs): Support **kwargs - return self._apply_nary_op(op) + return self._apply_unary_op(op) def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # In Bigframes BigQuery function, DataFrame '.apply' method is specifically diff --git a/packages/bigframes/bigframes/operations/__init__.py b/packages/bigframes/bigframes/operations/__init__.py index 09d64a15c6bf..b8d860029a0f 100644 --- a/packages/bigframes/bigframes/operations/__init__.py +++ b/packages/bigframes/bigframes/operations/__init__.py @@ -185,7 +185,6 @@ from bigframes.operations.remote_function_ops import ( PythonUdfOp, RemoteFunctionOp, - func_to_op, ) from bigframes.operations.string_ops import ( EndsWithOp, @@ -230,6 +229,7 @@ timestamp_add_op, timestamp_sub_op, ) +from bigframes.operations.to_op import func_to_op __all__ = [ # Base ops diff --git a/packages/bigframes/bigframes/operations/remote_function_ops.py b/packages/bigframes/bigframes/operations/remote_function_ops.py index a19cd43007a0..3ce77d51c615 100644 --- a/packages/bigframes/bigframes/operations/remote_function_ops.py +++ b/packages/bigframes/bigframes/operations/remote_function_ops.py @@ -19,7 +19,7 @@ from bigframes.operations import base_ops -@dataclasses.dataclass +@dataclasses.dataclass(frozen=True) class PythonUdfOp(base_ops.NaryOp): name: typing.ClassVar[str] = "python_udf" function_def: udf_def.PythonUdf diff --git a/packages/bigframes/bigframes/operations/to_op.py b/packages/bigframes/bigframes/operations/to_op.py index a177ea7b423e..c139541470d1 100644 --- a/packages/bigframes/bigframes/operations/to_op.py +++ b/packages/bigframes/bigframes/operations/to_op.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from bigframes.functions import Udf from bigframes.functions.udf_def import BigqueryUdf, PythonUdf from bigframes.operations import base_ops, remote_function_ops @@ -31,9 +32,10 @@ def func_to_op(op) -> base_ops.NaryOp: A bigframes operations. """ # TODO: Handle numpy ufuncs, builtin functions, etc. - if isinstance(op, BigqueryUdf): - return remote_function_ops.RemoteFunctionOp(function_def=op.udf_def) - elif isinstance(op, PythonUdf): - return remote_function_ops.PythonUdfOp(function_def=op.udf_def) + if isinstance(op, Udf): + if isinstance(op.udf_def, BigqueryUdf): + return remote_function_ops.RemoteFunctionOp(function_def=op.udf_def) + elif isinstance(op.udf_def, PythonUdf): + return remote_function_ops.PythonUdfOp(function_def=op.udf_def) else: raise TypeError(f"Unsupported function type: {op}") diff --git a/packages/bigframes/bigframes/session/bq_caching_executor.py b/packages/bigframes/bigframes/session/bq_caching_executor.py index 525fccaa1681..ba841bf8dcd6 100644 --- a/packages/bigframes/bigframes/session/bq_caching_executor.py +++ b/packages/bigframes/bigframes/session/bq_caching_executor.py @@ -35,13 +35,22 @@ import bigframes.core.schema as schemata import bigframes.core.tree_properties as tree_properties import bigframes.dtypes +import bigframes.operations as ops import bigframes.session._io.bigquery as bq_io import bigframes.session.execution_cache as execution_cache import bigframes.session.execution_spec as ex_spec import bigframes.session.metrics import bigframes.session.planner import bigframes.session.temporary_storage -from bigframes.core import compile, guid, identifiers, local_data, rewrite +from bigframes._config import ComputeOptions +from bigframes.core import ( + compile, + expression, + guid, + identifiers, + local_data, + rewrite, +) from bigframes.core.compile.sqlglot import sql as sg_sql from bigframes.core.compile.sqlglot import sqlglot_ir from bigframes.session import ( From a41370a0a961d42a51d6068ee3baa49215d057db Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 22 May 2026 01:48:49 +0000 Subject: [PATCH 05/21] fixes --- packages/bigframes/bigframes/core/blocks.py | 2 +- packages/bigframes/bigframes/dataframe.py | 10 ++++-- .../bigframes/functions/_function_session.py | 32 +++++++++---------- packages/bigframes/bigframes/series.py | 6 +++- .../bigframes/session/bq_caching_executor.py | 27 ++++------------ .../test_remote_function_op/out.sql | 7 +--- 6 files changed, 36 insertions(+), 48 deletions(-) diff --git a/packages/bigframes/bigframes/core/blocks.py b/packages/bigframes/bigframes/core/blocks.py index 6506b56a8bf8..33f5aaab5c7d 100644 --- a/packages/bigframes/bigframes/core/blocks.py +++ b/packages/bigframes/bigframes/core/blocks.py @@ -1098,7 +1098,7 @@ def multi_apply_unary_op( expr = op.as_expr(ex.free_var(input_varname)) else: input_varnames = op.free_variables - assert len(input_varnames) == 1 + assert len(set(input_varnames)) == 1 expr = op input_varname = input_varnames[0] diff --git a/packages/bigframes/bigframes/dataframe.py b/packages/bigframes/bigframes/dataframe.py index ff3142cb1510..cc80dd0af314 100644 --- a/packages/bigframes/bigframes/dataframe.py +++ b/packages/bigframes/bigframes/dataframe.py @@ -4694,10 +4694,14 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: if na_action not in {None, "ignore"}: raise ValueError(f"na_action={na_action} not supported") - op = ops.func_to_op(func) + expr = ops.func_to_op(func).as_expr(ex.free_var("input")) + if na_action == "ignore": + # True case, predicate, False case + expr = ops.where_op.as_expr( + expr, ops.notnull_op.as_expr(ex.free_var("input")), ex.const(None) + ) - # TODO(shobs): Support **kwargs - return self._apply_unary_op(op) + return DataFrame(self._block.multi_apply_unary_op(expr)) def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # In Bigframes BigQuery function, DataFrame '.apply' method is specifically diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 89b52e42a319..87838246b0d2 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -757,8 +757,6 @@ def udf( max_batching_rows: Optional[int] = None, container_cpu: Optional[float] = None, container_memory: Optional[str] = None, - *, - _force_deploy: bool = False, ): """Decorator to turn a Python user defined function (udf) into a BigQuery managed function. @@ -922,6 +920,9 @@ def wrapper(func): max_batching_rows=max_batching_rows, packages=tuple(packages) if packages else (), ) + if udf_sig.is_row_processor: + msg = bfe.format_message("input_types=Series is in preview.") + warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) if ( not name and not _force_deploy @@ -931,7 +932,8 @@ def wrapper(func): code=code_def, requirements=requirements, ) - else: + return bq_functions.UdfRoutine(func=func, _udf_def=udf_definition) + else: # deploy immediately bq_function_name = ( managed_function_client.provision_bq_managed_function( name=name, @@ -947,21 +949,17 @@ def wrapper(func): routine_ref=bigquery.RoutineReference.from_string(full_rf_name), signature=udf_sig, ) - - if udf_sig.is_row_processor: - msg = bfe.format_message("input_types=Series is in preview.") - warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) - - if not name: # session-owned resource - will be cleaned up automatically - if _force_deploy: + if name is None: + # Null name means anonymous, session-owned resource with force deploy. + # Unnamed resources are owned by the session and will be cleaned up automatically. self._update_temp_artifacts(full_rf_name, "") - return bq_functions.UdfRoutine(func=func, _udf_def=udf_definition) - - # user-managed permanent resource - will not be cleaned up automatically - else: - return bq_functions.BigqueryCallableRoutine( - udf_definition, session, local_func=func, is_managed=True - ) + return bq_functions.UdfRoutine(func=func, _udf_def=udf_definition) + else: + # user-managed permanent resource - will not be cleaned up automatically + # provide richer handle for backwards compatibility + return bq_functions.BigqueryCallableRoutine( + udf_definition, session, local_func=func, is_managed=True + ) return wrapper diff --git a/packages/bigframes/bigframes/series.py b/packages/bigframes/bigframes/series.py index 08bf3dc9bfa4..0091d0a34b6c 100644 --- a/packages/bigframes/bigframes/series.py +++ b/packages/bigframes/bigframes/series.py @@ -2096,7 +2096,11 @@ def combine( ) if isinstance(func, bigframes.functions.Udf): - result_series = self._apply_binary_op(other, ops.func_to_op(func)) + result_series = self._apply_nary_op(ops.func_to_op(func), (other,)) + if hasattr(other, "name") and other.name != self._name: # type: ignore + result_series.name = None + else: + result_series.name = self.name return result_series bf_op = python_ops.python_callable_to_op(func) diff --git a/packages/bigframes/bigframes/session/bq_caching_executor.py b/packages/bigframes/bigframes/session/bq_caching_executor.py index ba841bf8dcd6..e5b9039ec2a0 100644 --- a/packages/bigframes/bigframes/session/bq_caching_executor.py +++ b/packages/bigframes/bigframes/session/bq_caching_executor.py @@ -53,6 +53,7 @@ ) from bigframes.core.compile.sqlglot import sql as sg_sql from bigframes.core.compile.sqlglot import sqlglot_ir +from bigframes.functions import udf_def from bigframes.session import ( direct_gbq_execution, executor, @@ -522,20 +523,10 @@ def _prepare_plan_simplify(self, plan: nodes.BigFrameNode) -> nodes.BigFrameNode async def _deploy_undeployed_udfs( self, plan: nodes.BigFrameNode ) -> nodes.BigFrameNode: - undeployed_udfs = self._collect_udf_defs(plan) - if not undeployed_udfs: - return plan - - seen = set() - unique_undeployed_udfs = [] - for udf in undeployed_udfs: - if udf not in seen: - seen.add(udf) - unique_undeployed_udfs.append(udf) - + referenced_udfs = self._collect_udf_defs(plan) session = self.loader._session deployed_mapping: dict[udf_def.PythonUdf, udf_def.BigqueryUdf] = {} - for udf in unique_undeployed_udfs: + for udf in set(referenced_udfs): deployed_udf = await asyncio.to_thread( session._function_session._deploy_udf, session, @@ -552,10 +543,7 @@ def _collect_udf_defs(self, plan: nodes.BigFrameNode) -> list[udf_def.PythonUdf] for sub_expr in expr.walk(): if isinstance(sub_expr, expression.OpExpression): op = sub_expr.op - if isinstance( - op, - (ops.PythonUdfOp,), - ): + if isinstance(op, ops.PythonUdfOp): func_def = op.function_def if isinstance(func_def, udf_def.PythonUdf): udf_defs.append(func_def) @@ -566,7 +554,6 @@ def _subsitute_temporary_functions( plan: nodes.BigFrameNode, deployed_mapping: dict[udf_def.PythonUdf, udf_def.BigqueryUdf], ) -> nodes.BigFrameNode: - # Now rewrite the plan using bottom_up to substitute the UDF definitions! def replace_in_expr(expr: expression.Expression) -> expression.Expression: def replace_step(e: expression.Expression) -> expression.Expression: if isinstance(e, expression.OpExpression): @@ -574,9 +561,9 @@ def replace_step(e: expression.Expression) -> expression.Expression: if isinstance(op, ops.PythonUdfOp): func_def = op.function_def if func_def in deployed_mapping: - new_func_def = deployed_mapping[func_def] - new_op = dataclasses.replace(op, function_def=new_func_def) - return dataclasses.replace(e, op=new_op) + deployed_func = deployed_mapping[func_def] + rf_op = ops.RemoteFunctionOp(function_def=deployed_func) + return dataclasses.replace(e, op=rf_op) raise ValueError( f"UDF definition {func_def} not found in deployed mapping" ) diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_remote_function_op/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_remote_function_op/out.sql index 1854c0258825..a1977d809f70 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_remote_function_op/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_remote_function_op/out.sql @@ -1,8 +1,3 @@ SELECT - `my_project`.`my_dataset`.`my_routine`(`int64_col`) AS `apply_on_null_true`, - IF( - `int64_col` IS NULL, - `int64_col`, - `my_project`.`my_dataset`.`my_routine`(`int64_col`) - ) AS `apply_on_null_false` + `my_project`.`my_dataset`.`my_routine`(`int64_col`, `float64_col`, `string_col`) AS `int64_col` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file From 8195742a472c304a8e70f1d2fea73853a7eac106 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 22 May 2026 01:57:21 +0000 Subject: [PATCH 06/21] fix _force_deploy flag --- packages/bigframes/bigframes/functions/_function_session.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 87838246b0d2..2b241ad266c5 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -757,6 +757,8 @@ def udf( max_batching_rows: Optional[int] = None, container_cpu: Optional[float] = None, container_memory: Optional[str] = None, + *, + _force_deploy: bool = False, ): """Decorator to turn a Python user defined function (udf) into a BigQuery managed function. From c4d77280d41f25ff764b606a13da0b98357cf959 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 22 May 2026 02:05:46 +0000 Subject: [PATCH 07/21] executor owns function manager --- .../bigframes/session/bq_caching_executor.py | 17 ++++++++++++----- .../bigframes/session/proxy_executor.py | 6 ++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/packages/bigframes/bigframes/session/bq_caching_executor.py b/packages/bigframes/bigframes/session/bq_caching_executor.py index e5b9039ec2a0..5989f52c6ad7 100644 --- a/packages/bigframes/bigframes/session/bq_caching_executor.py +++ b/packages/bigframes/bigframes/session/bq_caching_executor.py @@ -62,6 +62,8 @@ read_api_execution, semi_executor, ) +import bigframes.functions._function_session as bff_session + # Max complexity that should be executed as a single query QUERY_COMPLEXITY_LIMIT = 1e7 @@ -130,6 +132,7 @@ def __init__( labels: tuple[tuple[str, str], ...] = (), compiler_name: Literal["ibis", "sqlglot"] = "sqlglot", cache: Optional[execution_cache.ExecutionCache] = None, + function_manager: bff_session.FunctionSession, ): self.bqclient = bqclient self.storage_manager = storage_manager @@ -165,6 +168,7 @@ def __init__( publisher=self._publisher, labels=dict(labels), ) + self._function_manager = function_manager def to_sql( self, @@ -523,16 +527,19 @@ def _prepare_plan_simplify(self, plan: nodes.BigFrameNode) -> nodes.BigFrameNode async def _deploy_undeployed_udfs( self, plan: nodes.BigFrameNode ) -> nodes.BigFrameNode: - referenced_udfs = self._collect_udf_defs(plan) + referenced_udfs = list(set(self._collect_udf_defs(plan))) session = self.loader._session deployed_mapping: dict[udf_def.PythonUdf, udf_def.BigqueryUdf] = {} - for udf in set(referenced_udfs): - deployed_udf = await asyncio.to_thread( - session._function_session._deploy_udf, + tasks = [ + asyncio.to_thread( + self._function_manager._deploy_udf, session, udf, ) - deployed_mapping[udf] = deployed_udf + for udf in referenced_udfs + ] + results = await asyncio.gather(*tasks) + deployed_mapping = dict(zip(referenced_udfs, results)) return self._subsitute_temporary_functions(plan, deployed_mapping) diff --git a/packages/bigframes/bigframes/session/proxy_executor.py b/packages/bigframes/bigframes/session/proxy_executor.py index 8f673a8bd1f4..7fd19afcede1 100644 --- a/packages/bigframes/bigframes/session/proxy_executor.py +++ b/packages/bigframes/bigframes/session/proxy_executor.py @@ -31,6 +31,8 @@ loader, temporary_storage, ) +import bigframes.functions._function_session as bff_session + _COMPILER_LABEL_KEY = "bigframes-compiler" @@ -50,6 +52,8 @@ def __init__( metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, enable_polars_execution: bool = False, publisher: bigframes.core.events.Publisher, + function_manager: bff_session.FunctionSession, + labels: tuple[tuple[str, str], ...] = (), ): self._enable_polars_execution = enable_polars_execution @@ -65,6 +69,7 @@ def __init__( labels=labels, cache=shared_cache, compiler_name="ibis", + function_manager=function_manager, ) self._sqlglot_executor = bq_caching_executor.BigQueryCachingExecutor( bqclient, @@ -77,6 +82,7 @@ def __init__( labels=labels, cache=shared_cache, compiler_name="sqlglot", + function_manager=function_manager, ) def to_sql( From 0f874b170c9307b75b60a4764755599f93ed24a3 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 22 May 2026 02:06:01 +0000 Subject: [PATCH 08/21] lint --- packages/bigframes/bigframes/session/bq_caching_executor.py | 3 +-- packages/bigframes/bigframes/session/proxy_executor.py | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/packages/bigframes/bigframes/session/bq_caching_executor.py b/packages/bigframes/bigframes/session/bq_caching_executor.py index 5989f52c6ad7..c8f6b45e00e8 100644 --- a/packages/bigframes/bigframes/session/bq_caching_executor.py +++ b/packages/bigframes/bigframes/session/bq_caching_executor.py @@ -35,6 +35,7 @@ import bigframes.core.schema as schemata import bigframes.core.tree_properties as tree_properties import bigframes.dtypes +import bigframes.functions._function_session as bff_session import bigframes.operations as ops import bigframes.session._io.bigquery as bq_io import bigframes.session.execution_cache as execution_cache @@ -62,8 +63,6 @@ read_api_execution, semi_executor, ) -import bigframes.functions._function_session as bff_session - # Max complexity that should be executed as a single query QUERY_COMPLEXITY_LIMIT = 1e7 diff --git a/packages/bigframes/bigframes/session/proxy_executor.py b/packages/bigframes/bigframes/session/proxy_executor.py index 7fd19afcede1..e05e02b611e1 100644 --- a/packages/bigframes/bigframes/session/proxy_executor.py +++ b/packages/bigframes/bigframes/session/proxy_executor.py @@ -22,6 +22,7 @@ import google.cloud.exceptions import bigframes.core +import bigframes.functions._function_session as bff_session from bigframes import exceptions as bfe from bigframes.session import ( bq_caching_executor, @@ -31,8 +32,6 @@ loader, temporary_storage, ) -import bigframes.functions._function_session as bff_session - _COMPILER_LABEL_KEY = "bigframes-compiler" @@ -53,7 +52,6 @@ def __init__( enable_polars_execution: bool = False, publisher: bigframes.core.events.Publisher, function_manager: bff_session.FunctionSession, - labels: tuple[tuple[str, str], ...] = (), ): self._enable_polars_execution = enable_polars_execution From 32be2a0a3ce0fad3038b479cee71fe43ba8fac18 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 22 May 2026 02:08:42 +0000 Subject: [PATCH 09/21] always give rich ref for deployed funcs --- .../bigframes/functions/_function_session.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 2b241ad266c5..1c6aa4b5e24a 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -955,13 +955,10 @@ def wrapper(func): # Null name means anonymous, session-owned resource with force deploy. # Unnamed resources are owned by the session and will be cleaned up automatically. self._update_temp_artifacts(full_rf_name, "") - return bq_functions.UdfRoutine(func=func, _udf_def=udf_definition) - else: - # user-managed permanent resource - will not be cleaned up automatically - # provide richer handle for backwards compatibility - return bq_functions.BigqueryCallableRoutine( - udf_definition, session, local_func=func, is_managed=True - ) + + return bq_functions.BigqueryCallableRoutine( + udf_definition, session, local_func=func, is_managed=True + ) return wrapper From 2ee86a51e44caa4216e89f15cf81728d669da042 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 22 May 2026 17:54:41 +0000 Subject: [PATCH 10/21] fixes --- .../bigframes/bigframes/functions/_function_session.py | 4 ++-- packages/bigframes/bigframes/session/__init__.py | 1 + .../bigframes/tests/unit/session/test_proxy_executor.py | 8 +++++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 1c6aa4b5e24a..281785fbea67 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -947,7 +947,7 @@ def wrapper(func): bq_function_name ) ) - udf_definition = udf_def.BigqueryUdf( + rf_def = udf_def.BigqueryUdf( routine_ref=bigquery.RoutineReference.from_string(full_rf_name), signature=udf_sig, ) @@ -957,7 +957,7 @@ def wrapper(func): self._update_temp_artifacts(full_rf_name, "") return bq_functions.BigqueryCallableRoutine( - udf_definition, session, local_func=func, is_managed=True + rf_def, session, local_func=func, is_managed=True ) return wrapper diff --git a/packages/bigframes/bigframes/session/__init__.py b/packages/bigframes/bigframes/session/__init__.py index 92e032bc31e4..d1bdc3854e46 100644 --- a/packages/bigframes/bigframes/session/__init__.py +++ b/packages/bigframes/bigframes/session/__init__.py @@ -338,6 +338,7 @@ def __init__( enable_polars_execution=context.enable_polars_execution, publisher=self._publisher, labels=tuple(labels.items()), + function_manager=self._function_session, ) def __del__(self): diff --git a/packages/bigframes/tests/unit/session/test_proxy_executor.py b/packages/bigframes/tests/unit/session/test_proxy_executor.py index 8e3760cdb035..5a95f212a553 100644 --- a/packages/bigframes/tests/unit/session/test_proxy_executor.py +++ b/packages/bigframes/tests/unit/session/test_proxy_executor.py @@ -30,8 +30,14 @@ def mock_executor(): bqstoragereadclient = mock.Mock() loader = mock.Mock() publisher = mock.Mock() + function_manager = mock.Mock() return DualCompilerProxyExecutor( - bqclient, storage_manager, bqstoragereadclient, loader, publisher=publisher + bqclient, + storage_manager, + bqstoragereadclient, + loader, + publisher=publisher, + function_manager=function_manager, ) From 181a78d4b04258b3d76d910735f9f230791ea46a Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 27 May 2026 04:45:19 +0000 Subject: [PATCH 11/21] refactor things a bit --- .../bigframes/functions/_function_client.py | 343 +++------- .../bigframes/functions/_function_session.py | 591 ++++++++---------- .../bigframes/bigframes/functions/_utils.py | 20 +- .../bigframes/bigframes/functions/function.py | 31 +- .../bigframes/bigframes/functions/udf_def.py | 8 + .../bigframes/bigframes/session/__init__.py | 43 +- .../session/_io/bigquery/__init__.py | 4 +- .../bigframes/session/bq_caching_executor.py | 1 - packages/bigframes/bigframes/testing/mocks.py | 1 + packages/bigframes/bigframes/testing/utils.py | 2 +- .../unit/functions/test_remote_function.py | 8 +- .../functions/test_remote_function_utils.py | 20 +- 12 files changed, 423 insertions(+), 649 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_client.py b/packages/bigframes/bigframes/functions/_function_client.py index 8ccc175cf122..67ae8e3f7605 100644 --- a/packages/bigframes/bigframes/functions/_function_client.py +++ b/packages/bigframes/bigframes/functions/_function_client.py @@ -42,11 +42,6 @@ from google.cloud import bigquery, functions_v2 from bigframes.functions import _utils -from bigframes.functions._utils import ( - _BIGFRAMES_FUNCTION_PREFIX, - _BQ_FUNCTION_NAME_SEPERATOR, - _GCF_FUNCTION_NAME_SEPERATOR, -) logger = logging.getLogger(__name__) @@ -71,8 +66,6 @@ # BQ managed functions (@udf) currently only support Python 3.11. _MANAGED_FUNC_PYTHON_VERSION = "python-3.11" -_DEFAULT_FUNCTION_MEMORY_MIB = 1024 - class FunctionClient: # Wait time (in seconds) for an IAM binding to take effect after creation. @@ -82,52 +75,42 @@ class FunctionClient: # deployment into method parameters. def __init__( self, - gcp_project_id, - bq_location, - bq_dataset, - bq_client, - bq_connection_manager, - cloud_function_region=None, - cloud_functions_client=None, - cloud_function_service_account=None, - cloud_function_kms_key_name=None, - cloud_function_docker_repository=None, - cloud_build_service_account=None, - *, - session: Session, + gcp_project_id: str, + bq_location: str, + bq_client: bigquery.Client, + bq_connection_manager: bigquery_connection.BigQueryConnectionManager, + cloud_functions_client: functions_v2.FunctionServiceClient, + publisher, ): self._gcp_project_id = gcp_project_id self._bq_location = bq_location - self._bq_dataset = bq_dataset self._bq_client = bq_client self._bq_connection_manager = bq_connection_manager - self._session = session - - # Optional attributes only for remote functions. - self._cloud_function_region = cloud_function_region + self._publisher = publisher self._cloud_functions_client = cloud_functions_client - self._cloud_function_service_account = cloud_function_service_account - self._cloud_function_kms_key_name = cloud_function_kms_key_name - self._cloud_function_docker_repository = cloud_function_docker_repository - self._cloud_build_service_account = cloud_build_service_account - - def _create_bq_connection(self, connection_id: str) -> None: - if self._bq_connection_manager: - self._bq_connection_manager.create_bq_connection( - self._gcp_project_id, - self._bq_location, - connection_id, - "run.invoker", - ) - def _ensure_dataset_exists(self) -> None: + self._cf_location = _utils.gcf_location_from_bq_location(bq_location) + + @property + def cloudfunctions_region(self) -> str: + return self._cf_location + + def _create_bq_connection( + self, + connection_id: str, + bq_project_id: str, + ) -> None: + self._bq_connection_manager.create_bq_connection( + bq_project_id, + self._bq_location, + connection_id, + "run.invoker", + ) + + def _ensure_dataset_exists(self, dataset_ref: bigquery.DatasetReference) -> None: # Make sure the dataset exists, i.e. if it doesn't exist, go ahead and # create it. - dataset = bigquery.Dataset( - bigquery.DatasetReference.from_string( - self._bq_dataset, default_project=self._gcp_project_id - ) - ) + dataset = bigquery.Dataset(dataset_ref) dataset.location = self._bq_location try: # This check does not require bigquery.datasets.create IAM @@ -143,14 +126,14 @@ def _create_bq_function(self, create_function_ddl: str) -> None: import bigframes.session._io.bigquery _, query_job = bigframes.session._io.bigquery.start_query_with_job( - cast(bigquery.Client, self._session.bqclient), + self._bq_client, create_function_ddl, job_config=bigquery.QueryJobConfig(), - location=None, + location=self._bq_location, project=None, timeout=None, metrics=None, - publisher=self._session._publisher, + publisher=self._publisher, ) logger.info(f"Created bigframes function {query_job.ddl_target_routine}") @@ -172,12 +155,20 @@ def format_val(val): def create_bq_remote_function( self, - name: str, + routine_ref: bigquery.RoutineReference, udf_def: udf_def.RemoteFunctionConfig, + maybe_reuse: bool, ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" - self._create_bq_connection(udf_def.connection_id) + + if maybe_reuse: + existing_rf_spec = self.get_remote_function_specs(routine_ref) + if existing_rf_spec and existing_rf_spec == udf_def: + logger.info(f"Remote function {str(routine_ref)} already exists.") + return + + self._create_bq_connection(udf_def.connection_id, routine_ref.project) # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 @@ -200,22 +191,24 @@ def create_bq_remote_function( import bigframes.core.utils # removes anything that isn't letter, number or underscore - _validate_routine_name(name) - bq_function_name_escaped = bigframes.core.sql.identifier(name) + _validate_routine_name(routine_ref.routine_id) + bq_function_name_escaped = bigframes.core.sql.identifier(routine_ref.routine_id) create_function_ddl = f""" - CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name_escaped}({udf_def.signature.to_sql_input_signature()}) + CREATE OR REPLACE FUNCTION `{routine_ref.project}.{routine_ref.dataset_id}.{bq_function_name_escaped}`({udf_def.signature.to_sql_input_signature()}) RETURNS {udf_def.signature.with_devirtualize().output.sql_type} - REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{udf_def.connection_id}` + REMOTE WITH CONNECTION `{routine_ref.project}.{self._bq_location}.{udf_def.connection_id}` OPTIONS ({remote_function_options_str})""" logger.info(f"Creating BQ remote function: {create_function_ddl}") - self._ensure_dataset_exists() + self._ensure_dataset_exists( + bigquery.DatasetReference(routine_ref.project, routine_ref.dataset_id) + ) self._create_bq_function(create_function_ddl) def provision_bq_managed_function( self, - name: Optional[str], + routine_ref: bigquery.RoutineReference, config: udf_def.ManagedFunctionConfig, ): """Create a BigQuery managed function.""" @@ -261,20 +254,13 @@ def provision_bq_managed_function( managed_function_options ) - bq_function_name = name - if not bq_function_name: - # Compute a unique hash representing the artifact definition. - bq_function_name = get_managed_function_name( - config, self._session.session_id - ) - persistent_func_id = ( - f"`{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}" + f"`{routine_ref.project}.{routine_ref.dataset_id}.{routine_ref.routine_id}`" ) with_connection_clause = ( ( - f"WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{config.bq_connection_id}`" + f"WITH CONNECTION `{routine_ref.project}.{self._bq_location}.{config.bq_connection_id}`" ) if config.bq_connection_id else "" @@ -304,27 +290,23 @@ def provision_bq_managed_function( .replace("__UDF_PLACE_HOLDER__", python_code_block) ) - self._ensure_dataset_exists() + self._ensure_dataset_exists( + bigquery.DatasetReference(routine_ref.project, routine_ref.dataset_id) + ) self._create_bq_function(create_function_ddl) - return bq_function_name - def get_cloud_function_fully_qualified_parent(self): "Get the fully qualilfied parent for a cloud function." return self._cloud_functions_client.common_location_path( - self._gcp_project_id, self._cloud_function_region + self._gcp_project_id, self._cf_location ) def get_cloud_function_fully_qualified_name(self, name): "Get the fully qualilfied name for a cloud function." return self._cloud_functions_client.function_path( - self._gcp_project_id, self._cloud_function_region, name + self._gcp_project_id, self._cf_location, name ) - def get_remote_function_fully_qualilfied_name(self, name): - "Get the fully qualilfied name for a BQ remote function." - return f"{self._gcp_project_id}.{self._bq_dataset}.{name}" - def get_cloud_function_endpoint(self, name) -> str | None: """Get the http endpoint of a cloud function if it exists.""" fully_qualified_name = self.get_cloud_function_fully_qualified_name(name) @@ -337,7 +319,7 @@ def get_cloud_function_endpoint(self, name) -> str | None: pass return None - def generate_cloud_function_code( + def _generate_cloud_function_code( self, code_def: udf_def.CodeDef, directory, @@ -385,7 +367,7 @@ def create_cloud_function( # Build and deploy folder structure containing cloud function with tempfile.TemporaryDirectory() as directory: - entry_point = self.generate_cloud_function_code( + entry_point = self._generate_cloud_function_code( config.code, directory, udf_signature=config.signature, @@ -403,7 +385,7 @@ def create_cloud_function( # Determine an upload URL for user code upload_url_request = functions_v2.GenerateUploadUrlRequest( - kms_key_name=self._cloud_function_kms_key_name + kms_key_name=config.kms_key_name ) upload_url_request.parent = self.get_cloud_function_fully_qualified_parent() upload_url_response = self._cloud_functions_client.generate_upload_url( @@ -442,15 +424,13 @@ def create_cloud_function( function.build_config.source.storage_source.object_ = ( upload_url_response.storage_source.object_ ) - function.build_config.docker_repository = ( - self._cloud_function_docker_repository - ) + function.build_config.docker_repository = config.docker_repository - if self._cloud_build_service_account: + if config.cloud_build_service_account: canonical_cloud_build_service_account = ( - self._cloud_build_service_account - if "/" in self._cloud_build_service_account - else f"projects/{self._gcp_project_id}/serviceAccounts/{self._cloud_build_service_account}" + config.cloud_build_service_account + if "/" in config.cloud_build_service_account + else f"projects/{self._gcp_project_id}/serviceAccounts/{config.cloud_build_service_account}" ) function.build_config.service_account = ( canonical_cloud_build_service_account @@ -490,9 +470,10 @@ def create_cloud_function( functions_v2.ServiceConfig.VpcConnectorEgressSettings, _VPC_EGRESS_SETTINGS_MAP[vpc_connector_egress_settings], ) - function.service_config.service_account_email = ( - self._cloud_function_service_account - ) + if config.cloud_function_service_account: + function.service_config.service_account_email = ( + config.cloud_function_service_account + ) if config.concurrency: function.service_config.max_instance_request_concurrency = ( config.concurrency @@ -518,7 +499,8 @@ def create_cloud_function( functions_v2.ServiceConfig.IngressSettings, _INGRESS_SETTINGS_MAP[config.ingress_settings], ) - function.kms_key_name = self._cloud_function_kms_key_name + if config.kms_key_name: + function.kms_key_name = config.kms_key_name create_function_request.function = function # Create the cloud function and wait for it to be ready to use @@ -556,127 +538,13 @@ def create_cloud_function( logger.info(f"Successfully created cloud function {name} with uri ({endpoint})") return endpoint - def provision_bq_remote_function( - self, - def_, - func_signature: udf_def.UdfSignature, - reuse: bool, - name: str | None, - package_requirements: tuple[str, ...], - max_batching_rows: int | None, - cloud_function_timeout: int | None, - cloud_function_max_instance_count: int | None, - cloud_function_vpc_connector: str | None, - cloud_function_vpc_connector_egress_settings: str | None, - cloud_function_memory_mib: int | None, - cloud_function_cpus: float | None, - cloud_function_ingress_settings: str, - bq_connection_id: str, - ): - """Provision a BigQuery remote function.""" - # Augment user package requirements with any internal package - # requirements - full_package_requirements = _utils.get_updated_package_requirements( - package_requirements, func_signature.is_row_processor - ) - - if cloud_function_memory_mib is None: - cloud_function_memory_mib = _DEFAULT_FUNCTION_MEMORY_MIB - - # assumption is most bigframes functions are cpu bound, single-threaded and many won't release GIL - # therefore, want to allocate a worker for each cpu, and allow a concurrent request per worker - expected_milli_cpus = ( - int(cloud_function_cpus * 1000) - if (cloud_function_cpus is not None) - else _infer_milli_cpus_from_memory(cloud_function_memory_mib) - ) - workers = -(expected_milli_cpus // -1000) # ceil(cpus) without invoking floats - threads = 4 # (per worker) - # max concurrency==1 for vcpus < 1 hard limit from cloud run - concurrency = (workers * threads) if (expected_milli_cpus >= 1000) else 1 - - cloud_func_spec = udf_def.CloudRunFunctionConfig( - code=udf_def.CodeDef.from_func(def_, full_package_requirements), - signature=func_signature, - timeout_seconds=cloud_function_timeout, - max_instance_count=cloud_function_max_instance_count, - vpc_connector=cloud_function_vpc_connector, - vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings - or "private-ranges-only", - memory_mib=cloud_function_memory_mib, - cpus=cloud_function_cpus, - ingress_settings=cloud_function_ingress_settings, - workers=workers, - threads=threads, - concurrency=concurrency, - ) - - # If reuse of any existing function with the same name (indicated by the - # same hash of its source code and config) is not intended, then attach a unique - # suffix to the intended function name to make it unique. - random_suffix = "".join( - random.choices(string.ascii_lowercase + string.digits, k=4) - ) - # Derive the name of the cloud function underlying the intended BQ - # remote function. Use the session id to identify the GCF for unnamed - # functions. The named remote functions are treated as a persistant - # artifacts, so let's keep them independent of session id, which also - # makes their naming more stable for the same udf code - cloud_function_name = get_cloud_function_name( - cloud_func_spec, - session_id=self._session.session_id if (name is None) else None, - uniq_suffix=random_suffix if (not reuse) else None, - ) - - cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) - # Create the cloud function if it does not exist - if not cf_endpoint: - cf_endpoint = self.create_cloud_function( - cloud_function_name, cloud_func_spec - ) - else: - logger.info(f"Cloud function {cloud_function_name} already exists.") - - intended_rf_spec = udf_def.RemoteFunctionConfig( - endpoint=cf_endpoint, - connection_id=bq_connection_id, - max_batching_rows=max_batching_rows or 1000, - signature=func_signature, - bq_metadata=func_signature.protocol_metadata, - ) - remote_function_name = name or get_bigframes_function_name( - intended_rf_spec, - self._session.session_id, - random_suffix if (not reuse) else None, - ) - - if reuse: - existing_rf_spec = self.get_remote_function_specs(remote_function_name) - # Create the BQ remote function in following circumstances: - # 1. It does not exist - # 2. It exists but the existing remote function has different - # configuration than intended - created_new = False - if not existing_rf_spec or (existing_rf_spec != intended_rf_spec): - self.create_bq_remote_function(remote_function_name, intended_rf_spec) - created_new = True - else: - logger.info(f"Remote function {remote_function_name} already exists.") - - return remote_function_name, cloud_function_name, created_new - else: - self.create_bq_remote_function(remote_function_name, intended_rf_spec) - return remote_function_name, cloud_function_name, True - def get_remote_function_specs( - self, remote_function_name: str + self, remote_function_name: bigquery.RoutineReference ) -> udf_def.RemoteFunctionConfig | None: """Check whether a remote function already exists for the udf.""" try: - routine = self._bq_client.get_routine( - f"{self._gcp_project_id}.{self._bq_dataset}.{remote_function_name}" - ) - if routine.reference.routine_id == remote_function_name: + routine = self._bq_client.get_routine(str(remote_function_name)) + if routine.reference == remote_function_name: try: return udf_def.RemoteFunctionConfig.from_bq_routine(routine) except udf_def.ReturnTypeMissingError: @@ -689,40 +557,17 @@ def get_remote_function_specs( pass return None + def delete_routine(self, routine_name: bigquery.RoutineReference) -> None: + self._bq_client.delete_routine(str(routine_name), not_found_ok=True) -def get_cloud_function_name( - function_def: udf_def.CloudRunFunctionConfig, session_id=None, uniq_suffix=None -): - "Get a name for the cloud function for the given user defined function." - parts = [_BIGFRAMES_FUNCTION_PREFIX] - if session_id: - parts.append(session_id) - parts.append(function_def.stable_hash().hex()) - if uniq_suffix: - parts.append(uniq_suffix) - return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) - - -def get_bigframes_function_name( - function: udf_def.RemoteFunctionConfig, session_id, uniq_suffix=None -): - "Get a name for the bigframes function for the given user defined function." - parts = [_BIGFRAMES_FUNCTION_PREFIX, session_id, function.stable_hash().hex()] - if uniq_suffix: - parts.append(uniq_suffix) - return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) - - -def get_managed_function_name( - function_def: udf_def.ManagedFunctionConfig, - session_id: str | None = None, -): - """Get a name for the bigframes managed function for the given user defined function.""" - parts = [_BIGFRAMES_FUNCTION_PREFIX] - if session_id: - parts.append(session_id) - parts.append(function_def.stable_hash().hex()) - return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) + def delete_cloud_function(self, cloud_function_name: str) -> None: + try: + self._cloud_functions_client.delete_function( + name=self.get_cloud_function_fully_qualified_name(cloud_function_name) + ) + except google.api_core.exceptions.NotFound: + # The dataset might not exist, in which case the remote function doesn't, either. + pass def _validate_routine_name(name: str) -> None: @@ -733,27 +578,3 @@ def _validate_routine_name(name: str) -> None: raise ValueError( "Routine ID can contain only letters (a-z, A-Z), numbers (0-9), or underscores (_)" ) - - -def _infer_milli_cpus_from_memory(memory_mib: int) -> int: - # observed values, not formally documented by cloud run functions - if memory_mib < 128: - raise ValueError("Cloud run supports at minimum 128MiB per instance") - elif memory_mib == 128: - return 83 - elif memory_mib <= 256: - return 167 - elif memory_mib <= 512: - return 333 - elif memory_mib <= 1024: - return 583 - elif memory_mib <= 2048: - return 1000 - elif memory_mib <= 8192: - return 2000 - elif memory_mib <= 16384: - return 4000 - elif memory_mib <= 32768: - return 8000 - else: - raise ValueError("Cloud run supports at most 32768MiB per instance") diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 281785fbea67..054bcbe8cca9 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -18,6 +18,9 @@ import collections.abc import functools import inspect +import logging +import random +import string import sys import threading import time @@ -45,22 +48,37 @@ import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting from bigframes import clients +from bigframes.functions import _function_client, _utils, udf_def from bigframes.functions import function as bq_functions -from bigframes.functions import udf_def +from bigframes.functions._utils import ( + _BIGFRAMES_FUNCTION_PREFIX, + _BQ_FUNCTION_NAME_SEPERATOR, + _GCF_FUNCTION_NAME_SEPERATOR, +) if TYPE_CHECKING: - from bigframes.session import Session + from bigframes.session import Session, anonymous_dataset + +_DEFAULT_FUNCTION_MEMORY_MIB = 1024 -from bigframes.functions import _function_client, _utils + +logger = logging.getLogger(__name__) class FunctionSession: """Session to manage bigframes functions.""" - def __init__(self): - # Session level mapping of function artifacts - self._temp_artifacts: Dict[str, str] = dict() + def __init__( + self, + functions_client: _function_client.FunctionClient, + dataset_manager: anonymous_dataset.AnonymousDatasetManager, + default_connection: str, + location: str, + session_id: str, + ): + self._temp_cloud_functions: set[str] = set() + self._temp_remote_functions: set[bigquery.RoutineReference] = set() # Lock to synchronize the update of the session artifacts self._artifacts_lock = threading.Lock() @@ -68,107 +86,61 @@ def __init__(self): self._deployed_routines: set[bytes] = set() self._deploying_routines: set[bytes] = set() - def _resolve_session(self, session: Optional[Session]) -> Session: - """Resolves the BigFrames session.""" - import bigframes.pandas as bpd - import bigframes.session - - # Using the global session if none is provided. - return cast(bigframes.session.Session, session or bpd.get_global_session()) - - def _resolve_bigquery_client( - self, session: Session, bigquery_client: Optional[bigquery.Client] - ) -> bigquery.Client: - """Resolves the BigQuery client.""" - if not bigquery_client: - bigquery_client = session.bqclient - if not bigquery_client: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "A bigquery client must be provided, either directly or via session.", - ) - return bigquery_client + self._function_client: _function_client.FunctionClient = functions_client + self._dataset_manager: anonymous_dataset.AnonymousDatasetManager = ( + dataset_manager + ) + self._default_connection: str = default_connection + self._location: str = location + self._session_id: str = session_id - def _resolve_bigquery_connection_client( - self, - session: Session, - bigquery_connection_client: Optional[ - bigquery_connection_v1.ConnectionServiceClient - ], - ) -> bigquery_connection_v1.ConnectionServiceClient: - """Resolves the BigQuery connection client.""" - if not bigquery_connection_client: - bigquery_connection_client = session.bqconnectionclient - if not bigquery_connection_client: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "A bigquery connection client must be provided, either " - "directly or via session.", - ) - return bigquery_connection_client + @property + def session_id(self) -> str: + return self._session_id - def _resolve_resource_manager_client( - self, - session: Session, - resource_manager_client: Optional[resourcemanager_v3.ProjectsClient], - ) -> resourcemanager_v3.ProjectsClient: - """Resolves the resource manager client.""" - if not resource_manager_client: - resource_manager_client = session.resourcemanagerclient - if not resource_manager_client: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "A resource manager client must be provided, either directly " - "or via session.", - ) - return resource_manager_client + @property + def default_dataset(self) -> bigquery.DatasetReference: + # We defer this as a property since this can actually take a query to determine + # which dataset it is. + return self._dataset_manager.dataset def _resolve_dataset_reference( self, - session: Session, - bigquery_client: bigquery.Client, dataset: Optional[str], ) -> bigquery.DatasetReference: - """Resolves the dataset reference for the bigframes function.""" - if dataset: - dataset_ref = bigquery.DatasetReference.from_string( - dataset, default_project=bigquery_client.project + """ + Resolves the dataset reference for the bigframes function. + """ + return ( + bigquery.DatasetReference.from_string( + dataset, default_project=self.default_dataset.project ) - else: - dataset_ref = session._anonymous_dataset - return dataset_ref + if dataset + else self.default_dataset + ) - def _resolve_cloud_functions_client( + def _resolve_routine_reference( self, - session: Session, - cloud_functions_client: Optional[functions_v2.FunctionServiceClient], - ) -> Optional[functions_v2.FunctionServiceClient]: - """Resolves the Cloud Functions client.""" - if not cloud_functions_client: - cloud_functions_client = session.cloudfunctionsclient - if not cloud_functions_client: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "A cloud functions client must be provided, either directly " - "or via session.", - ) - return cloud_functions_client + function_name: str, + dataset: Optional[bigquery.DatasetReference]=None, + ) -> bigquery.RoutineReference: + """Resolves the routine reference for a BQ routine.""" + dataset_ref = dataset if dataset else self.default_dataset + return dataset_ref.routine(function_name) def _resolve_bigquery_connection_id( self, - session: Session, dataset_ref: bigquery.DatasetReference, - bq_location: str, bigquery_connection: Optional[str] = None, ) -> str: """Resolves BigQuery connection id.""" if not bigquery_connection: - bigquery_connection = session.bq_connection # type: ignore + bigquery_connection = self._default_connection bigquery_connection = clients.get_canonical_bq_connection_id( bigquery_connection, default_project=dataset_ref.project, - default_location=bq_location, + default_location=self._location, ) # Guaranteed to be the form of .. ( @@ -182,51 +154,50 @@ def _resolve_bigquery_connection_id( "The project_id does not match BigQuery connection " f"gcp_project_id: {dataset_ref.project}.", ) - if bq_connection_location.casefold() != bq_location.casefold(): + if bq_connection_location.casefold() != self._location.casefold(): raise bf_formatting.create_exception_with_feedback_link( ValueError, "The location does not match BigQuery connection location: " - f"{bq_location}.", + f"{self._location}.", ) return bq_connection_id - def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): - """Update function artifacts in the current session.""" + def _add_temp_cloud_function(self, gcf_path: str): with self._artifacts_lock: - self._temp_artifacts[bqrf_routine] = gcf_path + self._temp_cloud_functions.add(gcf_path) + + def _add_temp_remote_function(self, bqrf_routine: bigquery.RoutineReference): + with self._artifacts_lock: + self._temp_remote_functions.add(bqrf_routine) + + def _deploy_managed_function( + self, + config: udf_def.ManagedFunctionConfig, + name: str, + temp: bool, + dataset: Optional[bigquery.DatasetReference]=None + ) -> udf_def.BigqueryUdf: + routine_ref = self._resolve_routine_reference(name, dataset=dataset) + if temp: + self._add_temp_remote_function(routine_ref) + self._function_client.provision_bq_managed_function( + routine_ref=routine_ref, config=config + ) + return udf_def.BigqueryUdf( + routine_ref=routine_ref, + signature=config.signature, + ) def _deploy_udf( self, - session: Session, bq_udf: udf_def.PythonUdf, ) -> udf_def.BigqueryUdf: """Deploys a UDF to BigQuery if not already deployed.""" udf_hash = bq_udf.stable_hash() - bigquery_client = self._resolve_bigquery_client(session, None) - bq_connection_manager = session.bqconnectionmanager - dataset_ref = self._resolve_dataset_reference(session, bigquery_client, None) - bq_location, _ = _utils.get_remote_function_locations(bigquery_client.location) - - managed_function_client = _function_client.FunctionClient( - dataset_ref.project, - bq_location, - dataset_ref.dataset_id, - bigquery_client, - bq_connection_manager, - session=session, - ) - config = bq_udf.to_managed_function_config() - bq_function_name = _function_client.get_managed_function_name( - config, session.session_id - ) - full_rf_name = ( - managed_function_client.get_remote_function_fully_qualilfied_name( - bq_function_name - ) - ) - routine_ref = bigquery.RoutineReference.from_string(full_rf_name) + bq_function_name = get_managed_function_name(config, self.session_id) + routine_ref = self._resolve_routine_reference(bq_function_name) with self._artifacts_lock: if udf_hash in self._deployed_routines: @@ -247,12 +218,11 @@ def _deploy_udf( self._deploying_routines.add(udf_hash) break - time.sleep(0.2) + time.sleep(0.1) try: - managed_function_client.provision_bq_managed_function( - name=bq_function_name, - config=config, + self._function_client.provision_bq_managed_function( + routine_ref=routine_ref, config=config ) except Exception: with self._artifacts_lock: @@ -262,35 +232,23 @@ def _deploy_udf( with self._artifacts_lock: self._deploying_routines.discard(udf_hash) self._deployed_routines.add(udf_hash) - self._temp_artifacts[full_rf_name] = "" + self._add_temp_remote_function(routine_ref) return udf_def.BigqueryUdf( routine_ref=routine_ref, signature=bq_udf.signature, ) - def clean_up( - self, - bqclient: bigquery.Client, - gcfclient: functions_v2.FunctionServiceClient, - session_id: str, - ): + def clean_up(self): """Delete function artifacts in the current session.""" with self._artifacts_lock: - for bqrf_routine, gcf_path in self._temp_artifacts.items(): - # Let's accept the possibility that the function may have been - # deleted directly by the user - bqclient.delete_routine(bqrf_routine, not_found_ok=True) + for bqrf_routine in self._temp_remote_functions: + self._function_client.delete_routine(bqrf_routine) + for gcf_name in self._temp_cloud_functions: + self._function_client.delete_cloud_function(gcf_name) - if gcf_path: - # Let's accept the possibility that the cloud function may - # have been deleted directly by the user - try: - gcfclient.delete_function(name=gcf_path) - except google.api_core.exceptions.NotFound: - pass - - self._temp_artifacts.clear() + self._temp_remote_functions.clear() + self._temp_cloud_functions.clear() # Inspired by @udf decorator implemented in ibis-bigquery package # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py @@ -301,13 +259,6 @@ def remote_function( *, input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, - session: Optional[Session] = None, - bigquery_client: Optional[bigquery.Client] = None, - bigquery_connection_client: Optional[ - bigquery_connection_v1.ConnectionServiceClient - ] = None, - cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, - resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, dataset: Optional[str] = None, bigquery_connection: Optional[str] = None, reuse: bool = True, @@ -393,24 +344,6 @@ def remote_function( be specified. The supported output types are `bool`, `bytes`, `float`, `int`, `str`, `list[bool]`, `list[float]`, `list[int]` and `list[str]`. - session (bigframes.Session, Optional): - BigQuery DataFrames session to use for getting default project, - dataset and BigQuery connection. - bigquery_client (google.cloud.bigquery.Client, Optional): - Client to use for BigQuery operations. If this param is not provided - then bigquery client from the session would be used. - bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): - Client to use for BigQuery connection operations. If this param is - not provided then bigquery connection client from the session would - be used. - cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): - Client to use for cloud functions operations. If this param is not - provided then the functions client from the session would be used. - resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): - Client to use for cloud resource management operations, e.g. for - getting and setting IAM roles on cloud resources. If this param is - not provided then resource manager client from the session would be - used. dataset (str, Optional): Dataset in which to create a BigQuery remote function. It should be in `.` or `` format. If this @@ -541,9 +474,6 @@ def remote_function( https://cloud.google.com/build/docs/cloud-build-service-account for more details. """ - # Some defaults may be used from the session if not provided otherwise. - session = self._resolve_session(session) - # If the user forces the cloud function service argument to None, throw # an exception if cloud_function_service_account is None: @@ -551,36 +481,14 @@ def remote_function( 'You must provide a user managed cloud_function_service_account, or "default" if you would like to let the default service account be used.' ) - # A BigQuery client is required to perform BQ operations. - bigquery_client = self._resolve_bigquery_client(session, bigquery_client) - - # A BigQuery connection client is required for BQ connection operations. - bigquery_connection_client = self._resolve_bigquery_connection_client( - session, bigquery_connection_client - ) - - # A resource manager client is required to get/set IAM operations. - resource_manager_client = self._resolve_resource_manager_client( - session, resource_manager_client - ) - # BQ remote function must be persisted, for which we need a dataset. # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. - dataset_ref = self._resolve_dataset_reference(session, bigquery_client, dataset) - - # A cloud functions client is required for cloud functions operations. - cloud_functions_client = self._resolve_cloud_functions_client( - session, cloud_functions_client - ) - - bq_location, cloud_function_region = _utils.get_remote_function_locations( - bigquery_client.location - ) - + dataset_ref = self._resolve_dataset_reference(dataset) + cloud_function_region = _utils.gcf_location_from_bq_location(self._location) # A connection is required for BQ remote function. # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function bq_connection_id = self._resolve_bigquery_connection_id( - session, dataset_ref, bq_location, bigquery_connection + dataset_ref, bigquery_connection ) # If any CMEK is intended then check that a docker repository is also specified. @@ -611,11 +519,10 @@ def remote_function( ) warnings.warn(msg, category=UserWarning, stacklevel=2) - bq_connection_manager = session.bqconnectionmanager - def wrapper(func): nonlocal input_types, output_type + ### Step 1: Validate inputs and package into cloud run function, remote function defs. ### if not callable(func): raise bf_formatting.create_exception_with_feedback_link( TypeError, f"func must be a callable, got {func}" @@ -629,90 +536,120 @@ def wrapper(func): else: signature_kwargs = {} # type: ignore - py_sig = inspect.signature( - func, - **signature_kwargs, - ) - py_sig = _resolve_signature(py_sig, input_types, output_type) - - remote_function_client = _function_client.FunctionClient( - dataset_ref.project, - bq_location, - dataset_ref.dataset_id, - bigquery_client, - bq_connection_manager, - cloud_function_region, - cloud_functions_client, - None - if cloud_function_service_account == "default" - else cloud_function_service_account, - cloud_function_kms_key_name, - cloud_function_docker_repository, - cloud_build_service_account=cloud_build_service_account, - session=session, # type: ignore + py_sig = _resolve_signature( + inspect.signature(func, **signature_kwargs), + input_types, + output_type, ) udf_sig = udf_def.UdfSignature.from_py_signature( py_sig ).to_remote_function_compatible() - ( - rf_name, - cf_name, - created_new, - ) = remote_function_client.provision_bq_remote_function( - func, - func_signature=udf_sig, - reuse=reuse or False, - name=name, - package_requirements=tuple(packages) if packages else tuple(), - max_batching_rows=max_batching_rows or 1000, - cloud_function_timeout=cloud_function_timeout, - cloud_function_max_instance_count=cloud_function_max_instances, - cloud_function_vpc_connector=cloud_function_vpc_connector, - cloud_function_vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, - cloud_function_memory_mib=cloud_function_memory_mib, - cloud_function_cpus=cloud_function_cpus, - cloud_function_ingress_settings=cloud_function_ingress_settings, - bq_connection_id=bq_connection_id, + full_package_requirements = _utils.get_updated_package_requirements( + packages or [], udf_sig.is_row_processor ) - - bigframes_cloud_function = ( - remote_function_client.get_cloud_function_fully_qualified_name(cf_name) + memory_mib = cloud_function_memory_mib or _DEFAULT_FUNCTION_MEMORY_MIB + + # assumption is most bigframes functions are cpu bound, single-threaded and many won't release GIL + # therefore, want to allocate a worker for each cpu, and allow a concurrent request per worker + expected_milli_cpus = ( + int(cloud_function_cpus * 1000) + if (cloud_function_cpus is not None) + else _infer_milli_cpus_from_memory(memory_mib) ) - bigframes_bigquery_function = ( - remote_function_client.get_remote_function_fully_qualilfied_name( - rf_name + workers = -( + expected_milli_cpus // -1000 + ) # ceil(cpus) without invoking floats + threads = 4 # (per worker) + # max concurrency==1 for vcpus < 1 hard limit from cloud run + concurrency = (workers * threads) if (expected_milli_cpus >= 1000) else 1 + + ### Step 1: Create resources or fetch existing matching resources. ### + cloud_func_spec = udf_def.CloudRunFunctionConfig( + code=udf_def.CodeDef.from_func(func, full_package_requirements), + signature=udf_sig, + timeout_seconds=cloud_function_timeout, + max_instance_count=cloud_function_max_instances, + vpc_connector=cloud_function_vpc_connector, + vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings + or "private-ranges-only", + memory_mib=memory_mib, + cpus=cloud_function_cpus, + ingress_settings=cloud_function_ingress_settings, + workers=workers, + threads=threads, + concurrency=concurrency, + kms_key_name=cloud_function_kms_key_name, + docker_repository=cloud_function_docker_repository, + cloud_build_service_account=cloud_build_service_account, + cloud_run_service_account=cloud_function_service_account, + ) + uniq_suffix = None + if not reuse: + uniq_suffix = "".join( + random.choices(string.ascii_lowercase + string.digits, k=4) ) + cf_name = get_cloud_function_name( + cloud_func_spec, + # only session scope a temp unnamed function + session_id=self.session_id if (name is None) else None, + uniq_suffix=uniq_suffix, ) + if not name: + self._add_temp_cloud_function(cf_name) - # If a new remote function was created, update the cloud artifacts - # created in the session. This would be used to clean up any - # resources in the session. Note that we need to do this only for - # the case where an explicit name was not provided by the user and - # we used an internal name. For the cases where the user provided an - # explicit name, we are assuming that the user wants to persist them - # with that name and would directly manage their lifecycle. - if created_new and (not name): - self._update_temp_artifacts( - bigframes_bigquery_function, bigframes_cloud_function + # Create remote function that points at the cloud function + cf_endpoint = None + if reuse is not None: + cf_endpoint = self._function_client.get_cloud_function_endpoint(cf_name) + + if cf_endpoint is None: + cf_endpoint = self._function_client.create_cloud_function( + cf_name, cloud_func_spec ) + else: + logger.info(f"Cloud function {cf_name} already exists.") + + remote_function_config = udf_def.RemoteFunctionConfig( + endpoint=cf_endpoint, + connection_id=bq_connection_id, + max_batching_rows=max_batching_rows or 1000, + signature=udf_sig, + bq_metadata=udf_sig.protocol_metadata, + ) + remote_function_name = name or get_bigframes_function_name( + remote_function_config, + session_id=self.session_id, + uniq_suffix=uniq_suffix, + ) + routine_ref = self._resolve_routine_reference(remote_function_name, dataset=dataset_ref) + if not name: + self._add_temp_remote_function(routine_ref) + + self._function_client.create_bq_remote_function( + udf_def=remote_function_config, + routine_ref=routine_ref, + maybe_reuse=reuse, + ) udf_definition = udf_def.BigqueryUdf( - routine_ref=bigquery.RoutineReference.from_string( - bigframes_bigquery_function - ), + routine_ref=routine_ref, signature=udf_sig, ) decorator = functools.wraps(func) if udf_sig.is_row_processor: msg = bfe.format_message("input_types=Series is in preview.") warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) + + cf_full_path = ( + self._function_client.get_cloud_function_fully_qualified_name(cf_name) + ) return decorator( bq_functions.BigqueryCallableRoutine( udf_definition, - session, - cloud_function_ref=bigframes_cloud_function, + self._function_client._bq_client, + cloud_function_ref=cf_full_path, local_func=func, is_managed=False, ) @@ -748,8 +685,6 @@ def udf( self, input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, - session: Optional[Session] = None, - bigquery_client: Optional[bigquery.Client] = None, dataset: Optional[str] = None, bigquery_connection: Optional[str] = None, name: Optional[str] = None, @@ -787,12 +722,6 @@ def udf( be specified. The supported output types are `bool`, `bytes`, `float`, `int`, `str`, `list[bool]`, `list[float]`, `list[int]` and `list[str]`. - session (bigframes.Session, Optional): - BigQuery DataFrames session to use for getting default project, - dataset and BigQuery connection. - bigquery_client (google.cloud.bigquery.Client, Optional): - Client to use for BigQuery operations. If this param is not - provided, then bigquery client from the session would be used. dataset (str, Optional): Dataset in which to create a BigQuery managed function. It should be in `.` or `` @@ -844,29 +773,16 @@ def udf( """ warnings.warn("udf is in preview.", category=bfe.PreviewWarning, stacklevel=5) - - # Some defaults may be used from the session if not provided otherwise. - session = self._resolve_session(session) - - # A BigQuery client is required to perform BQ operations. - bigquery_client = self._resolve_bigquery_client(session, bigquery_client) - # BQ managed function must be persisted, for which we need a dataset. - dataset_ref = self._resolve_dataset_reference(session, bigquery_client, dataset) - - bq_location, _ = _utils.get_remote_function_locations(bigquery_client.location) + dataset_ref = self._resolve_dataset_reference(dataset) # A connection is optional for BQ managed function. bq_connection_id = ( - self._resolve_bigquery_connection_id( - session, dataset_ref, bq_location, bigquery_connection - ) + self._resolve_bigquery_connection_id(dataset_ref, bigquery_connection) if bigquery_connection else None ) - bq_connection_manager = session.bqconnectionmanager - # TODO(b/399129906): Write a method for the repeated part in the wrapper # for both managed function and remote function. def wrapper(func): @@ -893,28 +809,9 @@ def wrapper(func): # The function will actually be receiving a pandas Series, but allow # both BigQuery DataFrames and pandas object types for compatibility. - udf_sig = udf_def.UdfSignature.from_py_signature(py_sig) - managed_function_client = _function_client.FunctionClient( - dataset_ref.project, - bq_location, - dataset_ref.dataset_id, - bigquery_client, - bq_connection_manager, - session=session, # type: ignore - ) code_def = udf_def.CodeDef.from_func(func, package_requirements=packages) - config = udf_def.ManagedFunctionConfig( - code=code_def, - signature=udf_sig, - max_batching_rows=max_batching_rows, - container_cpu=container_cpu, - container_memory=container_memory, - bq_connection_id=bq_connection_id, - capture_references=False, - ) - requirements = udf_def.RuntimeRequirements( container_cpu=container_cpu, container_memory=container_memory, @@ -927,7 +824,7 @@ def wrapper(func): warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) if ( - not name and not _force_deploy + not name and not dataset and not _force_deploy ): # session-owned resource - deferred deployment udf_definition = udf_def.PythonUdf( signature=udf_sig, @@ -936,28 +833,29 @@ def wrapper(func): ) return bq_functions.UdfRoutine(func=func, _udf_def=udf_definition) else: # deploy immediately - bq_function_name = ( - managed_function_client.provision_bq_managed_function( - name=name, - config=config, - ) + config = udf_def.ManagedFunctionConfig( + code=code_def, + signature=udf_sig, + max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, + bq_connection_id=bq_connection_id, + capture_references=False, ) - full_rf_name = ( - managed_function_client.get_remote_function_fully_qualilfied_name( - bq_function_name - ) + function_name = name or get_managed_function_name( + config, self.session_id ) - rf_def = udf_def.BigqueryUdf( - routine_ref=bigquery.RoutineReference.from_string(full_rf_name), - signature=udf_sig, + rf_def = self._deploy_managed_function( + config, + name=function_name, + temp=(name is None), + dataset=dataset_ref, ) - if name is None: - # Null name means anonymous, session-owned resource with force deploy. - # Unnamed resources are owned by the session and will be cleaned up automatically. - self._update_temp_artifacts(full_rf_name, "") - return bq_functions.BigqueryCallableRoutine( - rf_def, session, local_func=func, is_managed=True + rf_def, + self._function_client._bq_client, + local_func=func, + is_managed=True, ) return wrapper @@ -1015,3 +913,66 @@ def _resolve_signature( py_sig = py_sig.replace(return_annotation=output_type) return py_sig + + +def get_cloud_function_name( + function_def: udf_def.CloudRunFunctionConfig, session_id=None, uniq_suffix=False +): + """ + Get a name for the cloud function for the given user defined function. + + If make_unique is True, append a random suffix to the name. + """ + parts = [_BIGFRAMES_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_def.stable_hash().hex()) + if uniq_suffix: + parts.append(uniq_suffix) + return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) + + +def get_bigframes_function_name( + function: udf_def.RemoteFunctionConfig, session_id, uniq_suffix=None +): + """Get a name for the bigframes function for the given user defined function.""" + parts = [_BIGFRAMES_FUNCTION_PREFIX, session_id, function.stable_hash().hex()] + if uniq_suffix: + parts.append(uniq_suffix) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) + + +def get_managed_function_name( + function_def: udf_def.ManagedFunctionConfig, + session_id: str | None = None, +): + """Get a name for the bigframes managed function for the given user defined function.""" + parts = [_BIGFRAMES_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_def.stable_hash().hex()) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) + + +def _infer_milli_cpus_from_memory(memory_mib: int) -> int: + # observed values, not formally documented by cloud run functions + if memory_mib < 128: + raise ValueError("Cloud run supports at minimum 128MiB per instance") + elif memory_mib == 128: + return 83 + elif memory_mib <= 256: + return 167 + elif memory_mib <= 512: + return 333 + elif memory_mib <= 1024: + return 583 + elif memory_mib <= 2048: + return 1000 + elif memory_mib <= 8192: + return 2000 + elif memory_mib <= 16384: + return 4000 + elif memory_mib <= 32768: + return 8000 + else: + raise ValueError("Cloud run supports at most 32768MiB per instance") diff --git a/packages/bigframes/bigframes/functions/_utils.py b/packages/bigframes/bigframes/functions/_utils.py index 8bafc8766c79..36736cd6bd77 100644 --- a/packages/bigframes/bigframes/functions/_utils.py +++ b/packages/bigframes/bigframes/functions/_utils.py @@ -43,25 +43,19 @@ _pickle_protocol_version = 4 -def get_remote_function_locations(bq_location): - """Get BQ location and cloud functions region given a BQ client.""" - # TODO(shobs, b/274647164): Find the best way to determine default location. - # For now let's assume that if no BQ location is set in the client then it - # defaults to US multi region - bq_location = bq_location.lower() if bq_location else "us" - - # Cloud function should be in the same region as the bigquery remote function - cloud_function_region = bq_location +def gcf_location_from_bq_location(bq_location: str) -> str: + """Get the cloud functions region that corresponds to a BQ location.""" + bq_location = bq_location.lower() # BigQuery has multi region but cloud functions does not. # Any region in the multi region that supports cloud functions should work # https://cloud.google.com/functions/docs/locations if bq_location == "us": - cloud_function_region = "us-central1" + return "us-central1" elif bq_location == "eu": - cloud_function_region = "europe-west1" + return "europe-west1" - return bq_location, cloud_function_region + return bq_location def _package_existed(package_requirements: list[str], package: str) -> bool: @@ -164,7 +158,7 @@ def clean_up_by_session_id( # Now clean up the cloud functions bq_location = bqclient.get_dataset(dataset).location - bq_location, gcf_location = get_remote_function_locations(bq_location) + gcf_location = gcf_location_from_bq_location(bq_location) parent_path = gcfclient.common_location_path( project=dataset.project, location=gcf_location ) diff --git a/packages/bigframes/bigframes/functions/function.py b/packages/bigframes/bigframes/functions/function.py index e9a40f415324..242c9a850d13 100644 --- a/packages/bigframes/bigframes/functions/function.py +++ b/packages/bigframes/bigframes/functions/function.py @@ -65,7 +65,9 @@ def get_routine_reference( def remote_function(*args, **kwargs): - function_session = bff_session.FunctionSession() + import bigframes + + function_session = bigframes.get_global_session()._function_session return function_session.remote_function(*args, **kwargs) @@ -73,7 +75,9 @@ def remote_function(*args, **kwargs): def udf(*args, **kwargs): - function_session = bff_session.FunctionSession() + import bigframes + + function_session = bigframes.get_global_session()._function_session return function_session.udf(*args, **kwargs) @@ -81,24 +85,24 @@ def udf(*args, **kwargs): def _try_import_routine( - routine: bigquery.Routine, session: bigframes.Session + routine: bigquery.Routine, bq_client: bigquery.Client ) -> BigqueryCallableRoutine: udf_def = _routine_as_udf_def(routine) is_remote = ( hasattr(routine, "remote_function_options") and routine.remote_function_options ) - return BigqueryCallableRoutine(udf_def, session, is_managed=not is_remote) + return BigqueryCallableRoutine(udf_def, bq_client, is_managed=not is_remote) def _try_import_row_routine( - routine: bigquery.Routine, session: bigframes.Session + routine: bigquery.Routine, bq_client: bigquery.Client ) -> BigqueryCallableRoutine: udf_def = _routine_as_udf_def(routine, is_row_processor=True) is_remote = ( hasattr(routine, "remote_function_options") and routine.remote_function_options ) - return BigqueryCallableRoutine(udf_def, session, is_managed=not is_remote) + return BigqueryCallableRoutine(udf_def, bq_client, is_managed=not is_remote) def _routine_as_udf_def( @@ -148,9 +152,9 @@ def read_gbq_function( # TODO(493293086): Deprecate is_row_processor. if is_row_processor: - return _try_import_row_routine(routine, session) + return _try_import_row_routine(routine, bigquery_client) else: - return _try_import_routine(routine, session) + return _try_import_routine(routine, bigquery_client) @runtime_checkable @@ -175,14 +179,14 @@ class BigqueryCallableRoutine: def __init__( self, udf_def: udf_def.BigqueryUdf, - session: bigframes.Session, + bq_client: bigquery.Client, *, local_func: Optional[Callable] = None, cloud_function_ref: Optional[str] = None, is_managed: bool = False, ): self._udf_def = udf_def - self._session = session + self._bq_client = bq_client self._local_fun = local_func self._cloud_function = cloud_function_ref self._is_managed = is_managed @@ -196,13 +200,12 @@ def __call__(self, *args, **kwargs): args_string = ", ".join([sg_sql.to_sql(sg_sql.literal(v)) for v in args]) sql = f"SELECT `{str(self._udf_def.routine_ref)}`({args_string})" - iter, job = bf_io_bigquery.start_query_with_job( - self._session.bqclient, + row_iterator = bf_io_bigquery.start_query_job_optional( + self._bq_client, sql=sql, job_config=bigquery.QueryJobConfig(), - publisher=self._session._publisher, ) # type: ignore - return list(iter.to_arrow().to_pydict().values())[0][0] + return list(row_iterator.to_arrow().to_pydict().values())[0][0] @property def bigframes_bigquery_function(self) -> str: diff --git a/packages/bigframes/bigframes/functions/udf_def.py b/packages/bigframes/bigframes/functions/udf_def.py index b95dafc4253b..70e0406a6f6c 100644 --- a/packages/bigframes/bigframes/functions/udf_def.py +++ b/packages/bigframes/bigframes/functions/udf_def.py @@ -559,6 +559,10 @@ class CloudRunFunctionConfig: workers: int | None threads: int | None concurrency: int | None + kms_key_name: str | None + docker_repository: str | None + cloud_build_service_account: str | None + cloud_run_service_account: str | None def stable_hash(self) -> bytes: hash_val = google_crc32c.Checksum() @@ -574,6 +578,10 @@ def stable_hash(self) -> bytes: hash_val.update(str(self.workers).encode()) hash_val.update(str(self.threads).encode()) hash_val.update(str(self.concurrency).encode()) + hash_val.update(str(self.kms_key_name).encode()) + hash_val.update(str(self.docker_repository).encode()) + hash_val.update(str(self.cloud_build_service_account).encode()) + hash_val.update(str(self.cloud_run_service_account).encode()) return hash_val.digest() diff --git a/packages/bigframes/bigframes/session/__init__.py b/packages/bigframes/bigframes/session/__init__.py index d1bdc3854e46..bccf0fe32cbe 100644 --- a/packages/bigframes/bigframes/session/__init__.py +++ b/packages/bigframes/bigframes/session/__init__.py @@ -71,7 +71,6 @@ import bigframes.core.indexes.multi import bigframes.core.pyformat import bigframes.formatting_helpers -import bigframes.functions._function_session as bff_session import bigframes.functions.function as bff import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session.clients @@ -80,6 +79,7 @@ from bigframes import version from bigframes.core import blocks, utils from bigframes.core.logging import log_adapter +from bigframes.functions import _function_client, _function_session from bigframes.session import bigquery_session, executor, proxy_executor # Avoid circular imports. @@ -198,6 +198,7 @@ def __init__( # this path is only for unit testing. Not meant to be used by end users. self._clients_provider = clients_provider self._location = context.location or "US" + project = "test_project" else: credentials, project = ( bigframes._config.auth.resolve_credentials_and_project(context) @@ -292,7 +293,6 @@ def __init__( self._metrics = metrics.ExecutionMetrics() self._publisher.subscribe(self._metrics.on_event) - self._function_session = bff_session.FunctionSession() self._anon_dataset_manager = anonymous_dataset.AnonymousDatasetManager( self._clients_provider.bqclient, location=self._location, @@ -300,6 +300,20 @@ def __init__( kms_key=self._bq_kms_key_name, publisher=self._publisher, ) + self._function_session = _function_session.FunctionSession( + _function_client.FunctionClient( + gcp_project_id=project, + bq_location=self._location, + bq_client=self._clients_provider.bqclient, + bq_connection_manager=self._clients_provider.bqconnectionclient, + cloud_functions_client=self._clients_provider.cloudfunctionsclient, + publisher=self._publisher, + ), + dataset_manager=self._anon_dataset_manager, + default_connection=self._bq_connection, + location=self._location, + session_id=self._session_id, + ) # Session temp tables don't support specifying kms key, so use anon dataset if kms key specified self._session_resource_manager = ( bigquery_session.SessionResourceManager( @@ -475,9 +489,7 @@ def close(self): remote_function_session = getattr(self, "_function_session", None) if remote_function_session: - remote_function_session.clean_up( - self.bqclient, self.cloudfunctionsclient, self.session_id - ) + remote_function_session.clean_up() publisher_session = getattr(self, "_publisher", None) if publisher_session: @@ -1646,13 +1658,6 @@ def deploy_remote_function( """ return self._function_session.deploy_remote_function( func, - # Session-provided arguments. - session=self, - bigquery_client=self._clients_provider.bqclient, - bigquery_connection_client=self._clients_provider.bqconnectionclient, - cloud_functions_client=self._clients_provider.cloudfunctionsclient, - resource_manager_client=self._clients_provider.resourcemanagerclient, - # User-provided arguments. **kwargs, ) @@ -1893,12 +1898,6 @@ def remote_function( `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`. """ return self._function_session.remote_function( - # Session-provided arguments. - session=self, - bigquery_client=self._clients_provider.bqclient, - bigquery_connection_client=self._clients_provider.bqconnectionclient, - cloud_functions_client=self._clients_provider.cloudfunctionsclient, - resource_manager_client=self._clients_provider.resourcemanagerclient, # User-provided arguments. input_types=input_types, output_type=output_type, @@ -1945,10 +1944,6 @@ def deploy_udf( """ return self._function_session.deploy_udf( func, - # Session-provided arguments. - session=self, - bigquery_client=self._clients_provider.bqclient, - # User-provided arguments. **kwargs, ) @@ -2109,10 +2104,6 @@ def udf( deployed for the user defined code. """ return self._function_session.udf( - # Session-provided arguments. - session=self, - bigquery_client=self._clients_provider.bqclient, - # User-provided arguments. input_types=input_types, output_type=output_type, dataset=dataset, diff --git a/packages/bigframes/bigframes/session/_io/bigquery/__init__.py b/packages/bigframes/bigframes/session/_io/bigquery/__init__.py index 17534e59273d..8f76f2120ddb 100644 --- a/packages/bigframes/bigframes/session/_io/bigquery/__init__.py +++ b/packages/bigframes/bigframes/session/_io/bigquery/__init__.py @@ -355,7 +355,7 @@ def start_query_job_optional( # https://github.com/googleapis/python-bigquery/pull/2256 merged, likely # version 3.36.0 or later. job_retry: google.api_core.retry.Retry = (third_party_gcb_retry.DEFAULT_JOB_RETRY), # noqa: E501 - publisher: bigframes.core.events.Publisher, + publisher: Optional[bigframes.core.events.Publisher] = None, session=None, ) -> google.cloud.bigquery.table.RowIterator: """ @@ -373,7 +373,7 @@ def start_query_job_optional( project=project, api_timeout=timeout, job_retry=job_retry, - callback=create_bq_event_callback(publisher), + callback=create_bq_event_callback(publisher) if publisher else lambda _: None, ) if metrics is not None: metrics.count_job_stats(row_iterator=results_iterator) diff --git a/packages/bigframes/bigframes/session/bq_caching_executor.py b/packages/bigframes/bigframes/session/bq_caching_executor.py index c8f6b45e00e8..9cf2234a8111 100644 --- a/packages/bigframes/bigframes/session/bq_caching_executor.py +++ b/packages/bigframes/bigframes/session/bq_caching_executor.py @@ -532,7 +532,6 @@ async def _deploy_undeployed_udfs( tasks = [ asyncio.to_thread( self._function_manager._deploy_udf, - session, udf, ) for udf in referenced_udfs diff --git a/packages/bigframes/bigframes/testing/mocks.py b/packages/bigframes/bigframes/testing/mocks.py index 1c55ee5f4c7b..f8ad43dd6643 100644 --- a/packages/bigframes/bigframes/testing/mocks.py +++ b/packages/bigframes/bigframes/testing/mocks.py @@ -148,6 +148,7 @@ def query_and_wait_mock(query, *args, job_config=None, **kwargs): clients_provider = mock.create_autospec(bigframes.session.clients.ClientsProvider) type(clients_provider).bqclient = mock.PropertyMock(return_value=bqclient) clients_provider._credentials = credentials + clients_provider.project = anonymous_dataset.project bqoptions = bigframes.BigQueryOptions( credentials=credentials, diff --git a/packages/bigframes/bigframes/testing/utils.py b/packages/bigframes/bigframes/testing/utils.py index 50585759478c..b3b8ba1ab921 100644 --- a/packages/bigframes/bigframes/testing/utils.py +++ b/packages/bigframes/bigframes/testing/utils.py @@ -432,7 +432,7 @@ def get_cloud_functions( "Either 'name' or 'name_prefix' can be passed but not both." ) - _, location = bff_utils.get_remote_function_locations(location) + location = bff_utils.gcf_location_from_bq_location(location) parent = f"projects/{project}/locations/{location}" request = functions_v2.ListFunctionsRequest(parent=parent) page_result = functions_client.list_functions(request=request) diff --git a/packages/bigframes/tests/unit/functions/test_remote_function.py b/packages/bigframes/tests/unit/functions/test_remote_function.py index 17c04b338385..c38f9bde64f2 100644 --- a/packages/bigframes/tests/unit/functions/test_remote_function.py +++ b/packages/bigframes/tests/unit/functions/test_remote_function.py @@ -20,8 +20,8 @@ def test_missing_input_types(): session = mocks.create_bigquery_session() - remote_function_decorator = bff.remote_function( - session=session, cloud_function_service_account="default" + remote_function_decorator = session._function_session.remote_function( + cloud_function_service_account="default" ) def function_without_parameter_annotations(myparam) -> str: @@ -38,8 +38,8 @@ def function_without_parameter_annotations(myparam) -> str: def test_missing_output_type(): session = mocks.create_bigquery_session() - remote_function_decorator = bff.remote_function( - session=session, cloud_function_service_account="default" + remote_function_decorator = session._function_session.remote_function( + cloud_function_service_account="default" ) def function_without_return_annotation(myparam: int): diff --git a/packages/bigframes/tests/unit/functions/test_remote_function_utils.py b/packages/bigframes/tests/unit/functions/test_remote_function_utils.py index 5ca26fe96f6a..cbdb289e2650 100644 --- a/packages/bigframes/tests/unit/functions/test_remote_function_utils.py +++ b/packages/bigframes/tests/unit/functions/test_remote_function_utils.py @@ -23,22 +23,18 @@ @pytest.mark.parametrize( - ("input_location", "expected_bq_location", "expected_cf_region"), + ("input_location", "expected_cf_region"), [ - (None, "us", "us-central1"), - ("us", "us", "us-central1"), - ("eu", "eu", "europe-west1"), - ("US-east4", "us-east4", "us-east4"), + ("us", "us-central1"), + ("eu", "europe-west1"), + ("US-east4", "us-east4"), ], ) -def test_get_remote_function_locations( - input_location, expected_bq_location, expected_cf_region -): - """Tests getting remote function locations for various locations.""" - bq_location, cf_region = _utils.get_remote_function_locations(input_location) +def test_gcf_location_from_bq_location(input_location, expected_cf_region): + """Tests getting cloud function locations for various BigQuery locations.""" + gcf_location = _utils.gcf_location_from_bq_location(input_location) - assert bq_location == expected_bq_location - assert cf_region == expected_cf_region + assert gcf_location == expected_cf_region @patch("bigframes.functions._utils.numpy.__version__", "1.24.4") From 9f83747f162d1a7cd23d49590768bfcdc7c75612 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 27 May 2026 04:46:15 +0000 Subject: [PATCH 12/21] ruff --- .../bigframes/functions/_function_session.py | 14 ++++++++------ .../bigframes/session/_io/bigquery/__init__.py | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 054bcbe8cca9..44fc0aebff06 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -122,7 +122,7 @@ def _resolve_dataset_reference( def _resolve_routine_reference( self, function_name: str, - dataset: Optional[bigquery.DatasetReference]=None, + dataset: Optional[bigquery.DatasetReference] = None, ) -> bigquery.RoutineReference: """Resolves the routine reference for a BQ routine.""" dataset_ref = dataset if dataset else self.default_dataset @@ -171,11 +171,11 @@ def _add_temp_remote_function(self, bqrf_routine: bigquery.RoutineReference): self._temp_remote_functions.add(bqrf_routine) def _deploy_managed_function( - self, - config: udf_def.ManagedFunctionConfig, - name: str, + self, + config: udf_def.ManagedFunctionConfig, + name: str, temp: bool, - dataset: Optional[bigquery.DatasetReference]=None + dataset: Optional[bigquery.DatasetReference] = None, ) -> udf_def.BigqueryUdf: routine_ref = self._resolve_routine_reference(name, dataset=dataset) if temp: @@ -623,7 +623,9 @@ def wrapper(func): session_id=self.session_id, uniq_suffix=uniq_suffix, ) - routine_ref = self._resolve_routine_reference(remote_function_name, dataset=dataset_ref) + routine_ref = self._resolve_routine_reference( + remote_function_name, dataset=dataset_ref + ) if not name: self._add_temp_remote_function(routine_ref) diff --git a/packages/bigframes/bigframes/session/_io/bigquery/__init__.py b/packages/bigframes/bigframes/session/_io/bigquery/__init__.py index 8f76f2120ddb..5d985b6e107b 100644 --- a/packages/bigframes/bigframes/session/_io/bigquery/__init__.py +++ b/packages/bigframes/bigframes/session/_io/bigquery/__init__.py @@ -373,7 +373,9 @@ def start_query_job_optional( project=project, api_timeout=timeout, job_retry=job_retry, - callback=create_bq_event_callback(publisher) if publisher else lambda _: None, + callback=create_bq_event_callback(publisher) + if publisher + else lambda _: None, ) if metrics is not None: metrics.count_job_stats(row_iterator=results_iterator) From 28017edb53325a6e315c5960d881d688ae0e4ee6 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 27 May 2026 05:08:07 +0000 Subject: [PATCH 13/21] fixes --- .../bigframes/functions/_function_client.py | 17 +++++++++-------- .../bigframes/bigframes/session/__init__.py | 4 ++-- .../large/functions/test_managed_function.py | 10 +++------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_client.py b/packages/bigframes/bigframes/functions/_function_client.py index 67ae8e3f7605..dba92ade3d7b 100644 --- a/packages/bigframes/bigframes/functions/_function_client.py +++ b/packages/bigframes/bigframes/functions/_function_client.py @@ -78,7 +78,7 @@ def __init__( gcp_project_id: str, bq_location: str, bq_client: bigquery.Client, - bq_connection_manager: bigquery_connection.BigQueryConnectionManager, + bq_connection_manager, cloud_functions_client: functions_v2.FunctionServiceClient, publisher, ): @@ -110,15 +110,15 @@ def _create_bq_connection( def _ensure_dataset_exists(self, dataset_ref: bigquery.DatasetReference) -> None: # Make sure the dataset exists, i.e. if it doesn't exist, go ahead and # create it. - dataset = bigquery.Dataset(dataset_ref) - dataset.location = self._bq_location try: # This check does not require bigquery.datasets.create IAM # permission. So, if the data set already exists, then user can work # without having that permission. - self._bq_client.get_dataset(dataset) + self._bq_client.get_dataset(dataset_ref) except google.api_core.exceptions.NotFound: # This requires bigquery.datasets.create IAM permission. + dataset = bigquery.Dataset(dataset_ref) + dataset.location = self._bq_location self._bq_client.create_dataset(dataset, exists_ok=True) def _create_bq_function(self, create_function_ddl: str) -> None: @@ -424,9 +424,10 @@ def create_cloud_function( function.build_config.source.storage_source.object_ = ( upload_url_response.storage_source.object_ ) - function.build_config.docker_repository = config.docker_repository + if config.docker_repository is not None: + function.build_config.docker_repository = config.docker_repository - if config.cloud_build_service_account: + if config.cloud_build_service_account is not None: canonical_cloud_build_service_account = ( config.cloud_build_service_account if "/" in config.cloud_build_service_account @@ -470,9 +471,9 @@ def create_cloud_function( functions_v2.ServiceConfig.VpcConnectorEgressSettings, _VPC_EGRESS_SETTINGS_MAP[vpc_connector_egress_settings], ) - if config.cloud_function_service_account: + if config.cloud_run_service_account: function.service_config.service_account_email = ( - config.cloud_function_service_account + config.cloud_run_service_account ) if config.concurrency: function.service_config.max_instance_request_concurrency = ( diff --git a/packages/bigframes/bigframes/session/__init__.py b/packages/bigframes/bigframes/session/__init__.py index bccf0fe32cbe..bd1613477e03 100644 --- a/packages/bigframes/bigframes/session/__init__.py +++ b/packages/bigframes/bigframes/session/__init__.py @@ -1952,7 +1952,7 @@ def udf( *, input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, - dataset: str, + dataset: Optional[str] = None, bigquery_connection: Optional[str] = None, name: Optional[str] = None, packages: Optional[Sequence[str]] = None, @@ -2046,7 +2046,7 @@ def udf( be specified. The supported output types are `bool`, `bytes`, `float`, `int`, `str`, `list[bool]`, `list[float]`, `list[int]` and `list[str]`. - dataset (str): + dataset (str, Optional): Dataset in which to create a BigQuery managed function. It should be in `.` or `` format. diff --git a/packages/bigframes/tests/system/large/functions/test_managed_function.py b/packages/bigframes/tests/system/large/functions/test_managed_function.py index e93d2bb068be..0aaae89db364 100644 --- a/packages/bigframes/tests/system/large/functions/test_managed_function.py +++ b/packages/bigframes/tests/system/large/functions/test_managed_function.py @@ -1130,28 +1130,25 @@ def foo_list(x: int, y0: float, y1: bytes, y2: bool) -> list[str]: pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) -def test_deferred_unnamed_udf_execution(session, dataset_id, scalars_dfs): +def test_deferred_unnamed_udf_execution(session, scalars_dfs): import bigframes.functions.udf_def as udf_def # Create an unnamed UDF (name=None) - @session.udf(dataset=dataset_id) + @session.udf() def unnamed_multiplier(x: int) -> int: return x * 3 - # 1. Assert it is represented as a PythonUdf (not deployed yet) assert isinstance(unnamed_multiplier.udf_def, udf_def.PythonUdf) scalars_df, scalars_pandas_df = scalars_dfs bf_series = scalars_df["int64_too"] pd_series = scalars_pandas_df["int64_too"] - # 2. Applying it triggers deployment behind the scenes! bf_result = bf_series.apply(unnamed_multiplier).to_pandas() pd_result = pd_series.apply(lambda x: x * 3) pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) - # 3. Verify that the deployed routine name matches our stable hash and exists in BigQuery import bigframes.functions._function_client as bff_client config = unnamed_multiplier.udf_def.to_managed_function_config() @@ -1164,12 +1161,11 @@ def unnamed_multiplier(x: int) -> int: assert routine is not None -def test_deferred_udf_with_runtime_requirements(session, dataset_id, scalars_dfs): +def test_deferred_udf_with_runtime_requirements(session, scalars_dfs): import bigframes.functions.udf_def as udf_def # Create an unnamed UDF with custom options @session.udf( - dataset=dataset_id, container_cpu=1, container_memory="2Gi", max_batching_rows=25, From f0178253af6bff4893516b9b9b0cd888bc832f79 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 27 May 2026 05:10:20 +0000 Subject: [PATCH 14/21] fixes --- .../tests/system/large/functions/test_managed_function.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/bigframes/tests/system/large/functions/test_managed_function.py b/packages/bigframes/tests/system/large/functions/test_managed_function.py index 0aaae89db364..888852edd4d1 100644 --- a/packages/bigframes/tests/system/large/functions/test_managed_function.py +++ b/packages/bigframes/tests/system/large/functions/test_managed_function.py @@ -1149,10 +1149,10 @@ def unnamed_multiplier(x: int) -> int: pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) - import bigframes.functions._function_client as bff_client + import bigframes.functions._function_session as functions_sessions config = unnamed_multiplier.udf_def.to_managed_function_config() - expected_routine_name = bff_client.get_managed_function_name( + expected_routine_name = functions_sessions.get_managed_function_name( config, session.session_id ) routine = session.bqclient.get_routine( @@ -1185,10 +1185,10 @@ def heavy_unnamed_udf(x: int) -> int: pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) # Verify it was deployed with the correct runtime options - import bigframes.functions._function_client as bff_client + import bigframes.functions._function_session as functions_sessions config = heavy_unnamed_udf.udf_def.to_managed_function_config() - expected_routine_name = bff_client.get_managed_function_name( + expected_routine_name = functions_sessions.get_managed_function_name( config, session.session_id ) routine = session.bqclient.get_routine( From 4f6bd0820dcc88243ca2dd877f304d2c261bc49d Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 27 May 2026 21:14:21 +0000 Subject: [PATCH 15/21] fixes --- .../bigframes/functions/_function_client.py | 2 +- .../bigframes/functions/_function_session.py | 19 +++----- .../bigframes/bigframes/operations/to_op.py | 2 +- .../bigframes/bigframes/session/__init__.py | 5 +- .../bigframes/session/bq_caching_executor.py | 48 +++++++++---------- 5 files changed, 36 insertions(+), 40 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_client.py b/packages/bigframes/bigframes/functions/_function_client.py index dba92ade3d7b..0f766f260f12 100644 --- a/packages/bigframes/bigframes/functions/_function_client.py +++ b/packages/bigframes/bigframes/functions/_function_client.py @@ -194,7 +194,7 @@ def create_bq_remote_function( _validate_routine_name(routine_ref.routine_id) bq_function_name_escaped = bigframes.core.sql.identifier(routine_ref.routine_id) create_function_ddl = f""" - CREATE OR REPLACE FUNCTION `{routine_ref.project}.{routine_ref.dataset_id}.{bq_function_name_escaped}`({udf_def.signature.to_sql_input_signature()}) + CREATE OR REPLACE FUNCTION `{routine_ref.project}.{routine_ref.dataset_id}`.{bq_function_name_escaped}({udf_def.signature.to_sql_input_signature()}) RETURNS {udf_def.signature.with_devirtualize().output.sql_type} REMOTE WITH CONNECTION `{routine_ref.project}.{self._bq_location}.{udf_def.connection_id}` OPTIONS ({remote_function_options_str})""" diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 44fc0aebff06..9d88acbfcf4d 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -198,14 +198,6 @@ def _deploy_udf( config = bq_udf.to_managed_function_config() bq_function_name = get_managed_function_name(config, self.session_id) routine_ref = self._resolve_routine_reference(bq_function_name) - - with self._artifacts_lock: - if udf_hash in self._deployed_routines: - return udf_def.BigqueryUdf( - routine_ref=routine_ref, - signature=bq_udf.signature, - ) - while True: with self._artifacts_lock: if udf_hash in self._deployed_routines: @@ -219,7 +211,6 @@ def _deploy_udf( break time.sleep(0.1) - try: self._function_client.provision_bq_managed_function( routine_ref=routine_ref, config=config @@ -228,12 +219,10 @@ def _deploy_udf( with self._artifacts_lock: self._deploying_routines.discard(udf_hash) raise - + self._add_temp_remote_function(routine_ref) with self._artifacts_lock: self._deploying_routines.discard(udf_hash) self._deployed_routines.add(udf_hash) - self._add_temp_remote_function(routine_ref) - return udf_def.BigqueryUdf( routine_ref=routine_ref, signature=bq_udf.signature, @@ -583,7 +572,11 @@ def wrapper(func): kms_key_name=cloud_function_kms_key_name, docker_repository=cloud_function_docker_repository, cloud_build_service_account=cloud_build_service_account, - cloud_run_service_account=cloud_function_service_account, + cloud_run_service_account=( + None + if (cloud_function_service_account == "default") + else cloud_function_service_account + ), ) uniq_suffix = None if not reuse: diff --git a/packages/bigframes/bigframes/operations/to_op.py b/packages/bigframes/bigframes/operations/to_op.py index c139541470d1..0abddf18ec7d 100644 --- a/packages/bigframes/bigframes/operations/to_op.py +++ b/packages/bigframes/bigframes/operations/to_op.py @@ -19,7 +19,7 @@ def func_to_op(op) -> base_ops.NaryOp: """ - Convert various bigframes, python objects into a bigframes operations. + Convert various bigframes, python functions into bigframes operations. This should handle anything that might be passed to eg map, combine, other pandas methods that take a function. diff --git a/packages/bigframes/bigframes/session/__init__.py b/packages/bigframes/bigframes/session/__init__.py index bd1613477e03..c6b84d773575 100644 --- a/packages/bigframes/bigframes/session/__init__.py +++ b/packages/bigframes/bigframes/session/__init__.py @@ -305,7 +305,10 @@ def __init__( gcp_project_id=project, bq_location=self._location, bq_client=self._clients_provider.bqclient, - bq_connection_manager=self._clients_provider.bqconnectionclient, + bq_connection_manager=bigframes.clients.BqConnectionManager( + self._clients_provider.bqconnectionclient, + self._clients_provider.resourcemanagerclient, + ), cloud_functions_client=self._clients_provider.cloudfunctionsclient, publisher=self._publisher, ), diff --git a/packages/bigframes/bigframes/session/bq_caching_executor.py b/packages/bigframes/bigframes/session/bq_caching_executor.py index 9cf2234a8111..2c1b02c1957a 100644 --- a/packages/bigframes/bigframes/session/bq_caching_executor.py +++ b/packages/bigframes/bigframes/session/bq_caching_executor.py @@ -543,15 +543,17 @@ async def _deploy_undeployed_udfs( def _collect_udf_defs(self, plan: nodes.BigFrameNode) -> list[udf_def.PythonUdf]: udf_defs: list[udf_def.PythonUdf] = [] - for node in plan.unique_nodes(): - for expr in node._node_expressions: - for sub_expr in expr.walk(): - if isinstance(sub_expr, expression.OpExpression): - op = sub_expr.op - if isinstance(op, ops.PythonUdfOp): - func_def = op.function_def - if isinstance(func_def, udf_def.PythonUdf): - udf_defs.append(func_def) + exprs = [ + expr for node in plan.unique_nodes() for expr in node._node_expressions + ] + expr_nodes = [expr for expr in exprs for expr in expr.walk()] + for expr_node in expr_nodes: + if ( + isinstance(expr_node, expression.OpExpression) + and isinstance(expr_node.op, ops.PythonUdfOp) + and isinstance(expr_node.op.function_def, udf_def.PythonUdf) + ): + udf_defs.append(expr_node.op.function_def) return udf_defs def _subsitute_temporary_functions( @@ -559,22 +561,20 @@ def _subsitute_temporary_functions( plan: nodes.BigFrameNode, deployed_mapping: dict[udf_def.PythonUdf, udf_def.BigqueryUdf], ) -> nodes.BigFrameNode: + def replace_udf_expr(e: expression.Expression) -> expression.Expression: + if isinstance(e, expression.OpExpression) and isinstance( + e.op, ops.PythonUdfOp + ): + func_def = e.op.function_def + # We will have already deployed the function + assert func_def in deployed_mapping + deployed_func = deployed_mapping[func_def] + rf_op = ops.RemoteFunctionOp(function_def=deployed_func) + return dataclasses.replace(e, op=rf_op) + return e + def replace_in_expr(expr: expression.Expression) -> expression.Expression: - def replace_step(e: expression.Expression) -> expression.Expression: - if isinstance(e, expression.OpExpression): - op = e.op - if isinstance(op, ops.PythonUdfOp): - func_def = op.function_def - if func_def in deployed_mapping: - deployed_func = deployed_mapping[func_def] - rf_op = ops.RemoteFunctionOp(function_def=deployed_func) - return dataclasses.replace(e, op=rf_op) - raise ValueError( - f"UDF definition {func_def} not found in deployed mapping" - ) - return e - - return expr.bottom_up(replace_step) + return expr.bottom_up(replace_udf_expr) def replace_in_node(node: nodes.BigFrameNode) -> nodes.BigFrameNode: if hasattr(node, "transform_exprs"): From 683f4ab8ae64ad5b54e559b18c01d774ce2bcd2c Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 27 May 2026 21:15:12 +0000 Subject: [PATCH 16/21] del scratch --- packages/bigframes/scratch/.gitignore | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 packages/bigframes/scratch/.gitignore diff --git a/packages/bigframes/scratch/.gitignore b/packages/bigframes/scratch/.gitignore deleted file mode 100644 index b813ccd98e6a..000000000000 --- a/packages/bigframes/scratch/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# Ignore all files in this directory. -* From ea2595a874f06d8cdca2ebedc91cda81a2e3528e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 27 May 2026 23:56:50 +0000 Subject: [PATCH 17/21] ruff fixes --- .../bigframes/functions/_function_client.py | 15 ++++----------- .../bigframes/functions/_function_session.py | 9 +-------- .../bigframes/bigframes/functions/function.py | 2 -- .../bigframes/session/bq_caching_executor.py | 2 -- .../tests/unit/functions/test_remote_function.py | 1 - 5 files changed, 5 insertions(+), 24 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_client.py b/packages/bigframes/bigframes/functions/_function_client.py index 0f766f260f12..024443a8247d 100644 --- a/packages/bigframes/bigframes/functions/_function_client.py +++ b/packages/bigframes/bigframes/functions/_function_client.py @@ -17,30 +17,23 @@ import logging import os -import random import re import shutil -import string import tempfile import textwrap import types import warnings -from typing import TYPE_CHECKING, Any, Optional, cast +from typing import Any, cast +import google.api_core.exceptions +import google.api_core.retry import requests +from google.cloud import bigquery, functions_v2 import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting import bigframes.functions.function_template as bff_template import bigframes.functions.udf_def as udf_def - -if TYPE_CHECKING: - from bigframes.session import Session - -import google.api_core.exceptions -import google.api_core.retry -from google.cloud import bigquery, functions_v2 - from bigframes.functions import _utils logger = logging.getLogger(__name__) diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 9d88acbfcf4d..124364b75d74 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -28,21 +28,15 @@ from typing import ( TYPE_CHECKING, Any, - Dict, Literal, Mapping, Optional, Sequence, Union, - cast, ) -import google.api_core.exceptions from google.cloud import ( bigquery, - bigquery_connection_v1, - functions_v2, - resourcemanager_v3, ) import bigframes.exceptions as bfe @@ -57,7 +51,7 @@ ) if TYPE_CHECKING: - from bigframes.session import Session, anonymous_dataset + from bigframes.session import anonymous_dataset _DEFAULT_FUNCTION_MEMORY_MIB = 1024 @@ -473,7 +467,6 @@ def remote_function( # BQ remote function must be persisted, for which we need a dataset. # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. dataset_ref = self._resolve_dataset_reference(dataset) - cloud_function_region = _utils.gcf_location_from_bq_location(self._location) # A connection is required for BQ remote function. # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function bq_connection_id = self._resolve_bigquery_connection_id( diff --git a/packages/bigframes/bigframes/functions/function.py b/packages/bigframes/bigframes/functions/function.py index 242c9a850d13..b3a56dafcef6 100644 --- a/packages/bigframes/bigframes/functions/function.py +++ b/packages/bigframes/bigframes/functions/function.py @@ -26,8 +26,6 @@ from bigframes.functions import function_typing, udf_def if TYPE_CHECKING: - import bigframes.core.col - import bigframes.series from bigframes.session import Session logger = logging.getLogger(__name__) diff --git a/packages/bigframes/bigframes/session/bq_caching_executor.py b/packages/bigframes/bigframes/session/bq_caching_executor.py index 2c1b02c1957a..d7f228b1bc1f 100644 --- a/packages/bigframes/bigframes/session/bq_caching_executor.py +++ b/packages/bigframes/bigframes/session/bq_caching_executor.py @@ -43,7 +43,6 @@ import bigframes.session.metrics import bigframes.session.planner import bigframes.session.temporary_storage -from bigframes._config import ComputeOptions from bigframes.core import ( compile, expression, @@ -527,7 +526,6 @@ async def _deploy_undeployed_udfs( self, plan: nodes.BigFrameNode ) -> nodes.BigFrameNode: referenced_udfs = list(set(self._collect_udf_defs(plan))) - session = self.loader._session deployed_mapping: dict[udf_def.PythonUdf, udf_def.BigqueryUdf] = {} tasks = [ asyncio.to_thread( diff --git a/packages/bigframes/tests/unit/functions/test_remote_function.py b/packages/bigframes/tests/unit/functions/test_remote_function.py index c38f9bde64f2..19de301790d6 100644 --- a/packages/bigframes/tests/unit/functions/test_remote_function.py +++ b/packages/bigframes/tests/unit/functions/test_remote_function.py @@ -14,7 +14,6 @@ import pytest -import bigframes.functions.function as bff from bigframes.testing import mocks From 0880407322e0f6a978de4f4868df894009077a06 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 28 May 2026 00:01:29 +0000 Subject: [PATCH 18/21] fix test_method_matches_session --- packages/bigframes/bigframes/pandas/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/bigframes/bigframes/pandas/__init__.py b/packages/bigframes/bigframes/pandas/__init__.py index 082a00438f42..b88816ab5ab2 100644 --- a/packages/bigframes/bigframes/pandas/__init__.py +++ b/packages/bigframes/bigframes/pandas/__init__.py @@ -200,7 +200,7 @@ def udf( *, input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, - dataset: str, + dataset: Optional[str] = None, bigquery_connection: Optional[str] = None, name: Optional[str] = None, packages: Optional[Sequence[str]] = None, From bf8d8389fae31c1aef45fe6f2f97ef033773c527 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 28 May 2026 01:03:38 +0000 Subject: [PATCH 19/21] fix small tests and connection check handling --- .../bigframes/functions/_function_client.py | 7 +- .../bigframes/functions/_function_session.py | 3 + .../bigframes/bigframes/session/__init__.py | 1 + .../small/functions/test_remote_function.py | 101 +++--------------- 4 files changed, 22 insertions(+), 90 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_client.py b/packages/bigframes/bigframes/functions/_function_client.py index 024443a8247d..81c0c089a4c8 100644 --- a/packages/bigframes/bigframes/functions/_function_client.py +++ b/packages/bigframes/bigframes/functions/_function_client.py @@ -61,9 +61,6 @@ class FunctionClient: - # Wait time (in seconds) for an IAM binding to take effect after creation. - _iam_wait_seconds = 120 - # TODO(b/392707725): Convert all necessary parameters for cloud function # deployment into method parameters. def __init__( @@ -151,6 +148,7 @@ def create_bq_remote_function( routine_ref: bigquery.RoutineReference, udf_def: udf_def.RemoteFunctionConfig, maybe_reuse: bool, + try_create_connection: bool, ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" @@ -161,7 +159,8 @@ def create_bq_remote_function( logger.info(f"Remote function {str(routine_ref)} already exists.") return - self._create_bq_connection(udf_def.connection_id, routine_ref.project) + if try_create_connection: + self._create_bq_connection(udf_def.connection_id, routine_ref.project) # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 124364b75d74..0f06e9b78c8f 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -70,6 +70,7 @@ def __init__( default_connection: str, location: str, session_id: str, + manage_connections: bool, ): self._temp_cloud_functions: set[str] = set() self._temp_remote_functions: set[bigquery.RoutineReference] = set() @@ -87,6 +88,7 @@ def __init__( self._default_connection: str = default_connection self._location: str = location self._session_id: str = session_id + self._manage_connections: bool = manage_connections @property def session_id(self) -> str: @@ -619,6 +621,7 @@ def wrapper(func): udf_def=remote_function_config, routine_ref=routine_ref, maybe_reuse=reuse, + try_create_connection=self._manage_connections, ) udf_definition = udf_def.BigqueryUdf( diff --git a/packages/bigframes/bigframes/session/__init__.py b/packages/bigframes/bigframes/session/__init__.py index c6b84d773575..bbe27b6a795a 100644 --- a/packages/bigframes/bigframes/session/__init__.py +++ b/packages/bigframes/bigframes/session/__init__.py @@ -316,6 +316,7 @@ def __init__( default_connection=self._bq_connection, location=self._location, session_id=self._session_id, + manage_connections=not self._skip_bq_connection_check, ) # Session temp tables don't support specifying kms key, so use anon dataset if kms key specified self._session_resource_manager = ( diff --git a/packages/bigframes/tests/system/small/functions/test_remote_function.py b/packages/bigframes/tests/system/small/functions/test_remote_function.py index eb0593e1ab6d..a970fab64db3 100644 --- a/packages/bigframes/tests/system/small/functions/test_remote_function.py +++ b/packages/bigframes/tests/system/small/functions/test_remote_function.py @@ -118,10 +118,6 @@ def get_bq_connection_id_path_format(connection_id_dot_format): @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( - bigquery_client, - bigqueryconnection_client, - cloudfunctions_client, - resourcemanager_client, scalars_dfs, dataset_id_permanent, bq_cf_connection, @@ -132,10 +128,6 @@ def square(x): square = bff.remote_function( input_types=int, output_type=int, - bigquery_client=bigquery_client, - bigquery_connection_client=bigqueryconnection_client, - cloud_functions_client=cloudfunctions_client, - resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. @@ -186,10 +178,9 @@ def test_remote_function_connection_w_location( def square(x): return x * x - square = bff.remote_function( + square = session.remote_function( input_types=int, output_type=int, - session=session, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location, # See e2e tests for tests that actually deploy the Cloud Function. @@ -255,10 +246,9 @@ def square(x): "The location does not match BigQuery connection location:" ), ): - bff.remote_function( + session.remote_function( input_types=int, output_type=int, - session=session, dataset=dataset_id_permanent, bigquery_connection=connection_id, # See e2e tests for tests that actually deploy the Cloud Function. @@ -278,10 +268,9 @@ def test_remote_function_connection_w_location_project( def square(x): return x * x - square = bff.remote_function( + square = session.remote_function( input_types=int, output_type=int, - session=session, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location_project, # See e2e tests for tests that actually deploy the Cloud Function. @@ -349,10 +338,9 @@ def square(x): "The project_id does not match BigQuery connection gcp_project_id:" ), ): - bff.remote_function( + session.remote_function( input_types=int, output_type=int, - session=session, dataset=dataset_id_permanent, bigquery_connection=connection_id, # See e2e tests for tests that actually deploy the Cloud Function. @@ -362,49 +350,6 @@ def square(x): )(square) -@pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_session_param( - session_with_bq_connection, scalars_dfs, dataset_id_permanent -): - def square(x): - return x * x - - square = bff.remote_function( - input_types=int, - output_type=int, - session=session_with_bq_connection, - dataset=dataset_id_permanent, - name=get_function_name(square), - cloud_function_service_account="default", - )(square) - - # Function should still work normally. - assert square(2) == 4 - - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] - bf_int64_col_filter = bf_int64_col.notnull() - bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] - bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() - ) - - pd_int64_col = scalars_pandas_df["int64_col"] - pd_int64_col_filter = pd_int64_col.notnull() - pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] - pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) - # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. - # pd_int64_col_filtered.dtype is Int64Dtype() - # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. - # For this test let's force the pandas dtype to be same as bigframes' dtype. - pd_result_col = pd_result_col.astype(pd.Int64Dtype()) - pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - - assert_frame_equal(bf_result, pd_result) - - @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_via_session_default( session_with_bq_connection, scalars_dfs, dataset_id_permanent @@ -681,9 +626,8 @@ def add_one(x): def test_read_gbq_function_detects_invalid_function(session, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) with pytest.raises(ValueError) as e: - bff.read_gbq_function( + session.read_gbq_function( str(dataset_ref.routine("not_a_function")), - session=session, ) assert "Unknown function" in str(e.value) @@ -692,10 +636,6 @@ def test_read_gbq_function_detects_invalid_function(session, dataset_id): @pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_like_original( session, - bigquery_client, - bigqueryconnection_client, - cloudfunctions_client, - resourcemanager_client, scalars_df_index, dataset_id_permanent, bq_cf_connection, @@ -706,11 +646,7 @@ def square1(x): square1 = bff.remote_function( input_types=[int], output_type=int, - bigquery_client=bigquery_client, - bigquery_connection_client=bigqueryconnection_client, dataset=dataset_id_permanent, - cloud_functions_client=cloudfunctions_client, - resource_manager_client=resourcemanager_client, bigquery_connection=bq_cf_connection, reuse=True, name=get_function_name(square1), @@ -720,9 +656,8 @@ def square1(x): # Function should still work normally. assert square1(2) == 4 - square2 = bff.read_gbq_function( + square2 = session.read_gbq_function( function_name=square1.bigframes_bigquery_function, # type: ignore - session=session, ) # The newly-created function (square1) should have a remote function AND a @@ -922,9 +857,8 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): for routine in (sql_routine, js_routine): # Create the routine in BigQuery and read it back using read_gbq_function. bigquery_client.create_routine(routine, exists_ok=True) - square = bff.read_gbq_function( + square = session.read_gbq_function( str(routine.reference), - session=session, ) # It should point to the named routine and yield the expected results. @@ -998,27 +932,23 @@ def test_read_gbq_function_requires_explicit_types( bigquery_client.create_routine(only_arg_type_specified, exists_ok=True) bigquery_client.create_routine(neither_type_specified, exists_ok=True) - bff.read_gbq_function( + session.read_gbq_function( str(both_types_specified.reference), - session=session, ) with pytest.warns( bigframes.exceptions.UnknownDataTypeWarning, match=r"missing input data types[\s\S]*assume default data type", ): - bff.read_gbq_function( + session.read_gbq_function( str(only_return_type_specified.reference), - session=session, ) with pytest.raises(ValueError): - bff.read_gbq_function( + session.read_gbq_function( str(only_arg_type_specified.reference), - session=session, ) with pytest.raises(ValueError): - bff.read_gbq_function( + session.read_gbq_function( str(neither_type_specified.reference), - session=session, ) @@ -1063,7 +993,7 @@ def test_read_gbq_function_respects_python_output_type( # Create the routine in BigQuery and read it back using read_gbq_function. bigquery_client.create_routine(sql_routine, exists_ok=True) - func = bff.read_gbq_function(str(sql_routine.reference), session=session) + func = session.read_gbq_function(str(sql_routine.reference)) # test that the function works as expected s = bigframes.series.Series([1, 10, 100]) @@ -1109,7 +1039,7 @@ def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( TypeError, match="An explicit output_type should be provided only for a BigQuery function with STRING output.", ): - bff.read_gbq_function(str(sql_routine.reference), session=session) + session.read_gbq_function(str(sql_routine.reference)) @pytest.mark.parametrize( @@ -1140,7 +1070,7 @@ def test_read_gbq_function_supported_python_output_type( # Create the routine in BigQuery and read it back using read_gbq_function. bigquery_client.create_routine(sql_routine, exists_ok=True) - bff.read_gbq_function(str(sql_routine.reference), session=session) + session.read_gbq_function(str(sql_routine.reference)) @pytest.mark.flaky(retries=2, delay=120) @@ -1658,10 +1588,9 @@ def func_tuple(x): ValueError, match=r"must be one of the supported types", ): - bff.remote_function( + session.remote_function( input_types=int, output_type=Sequence[int], - session=session, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection, reuse=True, From 476d1db2a6edc8b2d06b14773018b83aa6a72f10 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 28 May 2026 17:54:57 +0000 Subject: [PATCH 20/21] read scratch gitignore --- packages/bigframes/scratch/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 packages/bigframes/scratch/.gitignore diff --git a/packages/bigframes/scratch/.gitignore b/packages/bigframes/scratch/.gitignore new file mode 100644 index 000000000000..b813ccd98e6a --- /dev/null +++ b/packages/bigframes/scratch/.gitignore @@ -0,0 +1,2 @@ +# Ignore all files in this directory. +* From 1b72f0abd6930076ccdee5da32ebef937520bec0 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 28 May 2026 19:20:16 +0000 Subject: [PATCH 21/21] fixes --- .../bigframes/functions/_function_session.py | 14 +++++++------- packages/bigframes/bigframes/operations/to_op.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/bigframes/bigframes/functions/_function_session.py b/packages/bigframes/bigframes/functions/_function_session.py index 0f06e9b78c8f..e369b0b39bfd 100644 --- a/packages/bigframes/bigframes/functions/_function_session.py +++ b/packages/bigframes/bigframes/functions/_function_session.py @@ -674,13 +674,13 @@ def deploy_remote_function( def udf( self, - input_types: Union[None, type, Sequence[type]] = None, - output_type: Optional[type] = None, - dataset: Optional[str] = None, - bigquery_connection: Optional[str] = None, - name: Optional[str] = None, - packages: Optional[Sequence[str]] = None, - max_batching_rows: Optional[int] = None, + input_types: type | Sequence[type] | None = None, + output_type: type | None = None, + dataset: str | None = None, + bigquery_connection: str | None = None, + name: str | None = None, + packages: Sequence[str] | None = None, + max_batching_rows: int | None = None, container_cpu: Optional[float] = None, container_memory: Optional[str] = None, *, diff --git a/packages/bigframes/bigframes/operations/to_op.py b/packages/bigframes/bigframes/operations/to_op.py index 0abddf18ec7d..7fd44d957e40 100644 --- a/packages/bigframes/bigframes/operations/to_op.py +++ b/packages/bigframes/bigframes/operations/to_op.py @@ -31,7 +31,7 @@ def func_to_op(op) -> base_ops.NaryOp: Returns: A bigframes operations. """ - # TODO: Handle numpy ufuncs, builtin functions, etc. + # TODO(b/517578802): Handle numpy ufuncs, builtin functions, etc. if isinstance(op, Udf): if isinstance(op.udf_def, BigqueryUdf): return remote_function_ops.RemoteFunctionOp(function_def=op.udf_def)