import asyncio
import contextvars
import ctypes
import functools
import json
import time
import traceback
from concurrent.futures import ThreadPoolExecutor
from enum import Enum
from multiprocessing import Queue
from threading import Thread, current_thread
from typing import Any, Callable, Dict, cast
from pydantic import BaseModel
from hatchet_sdk.client import new_client_raw
from hatchet_sdk.clients.admin import new_admin
from hatchet_sdk.clients.dispatcher.action_listener import Action
from hatchet_sdk.clients.dispatcher.dispatcher import new_dispatcher
from hatchet_sdk.clients.run_event_listener import new_listener
from hatchet_sdk.clients.workflow_listener import PooledWorkflowRunListener
from hatchet_sdk.context import Context # type: ignore[attr-defined]
from hatchet_sdk.context.worker_context import WorkerContext
from hatchet_sdk.contracts.dispatcher_pb2 import (
GROUP_KEY_EVENT_TYPE_COMPLETED,
GROUP_KEY_EVENT_TYPE_FAILED,
GROUP_KEY_EVENT_TYPE_STARTED,
STEP_EVENT_TYPE_COMPLETED,
STEP_EVENT_TYPE_FAILED,
STEP_EVENT_TYPE_STARTED,
ActionType,
)
from hatchet_sdk.loader import ClientConfig
from hatchet_sdk.logger import logger
from hatchet_sdk.utils.types import WorkflowValidator
from hatchet_sdk.v2.callable import DurableContext
from hatchet_sdk.worker.action_listener_process import ActionEvent
from hatchet_sdk.worker.runner.utils.capture_logs import copy_context_vars, sr, wr
class WorkerStatus(Enum):
    """Enumeration of the states a worker can be in.

    The numeric values are part of the enum's identity; the members are
    listed in ascending value order. This enum is not referenced by the
    other definitions visible in this part of the file.
    """

    # NOTE(review): meanings below are inferred from the member names only.
    INITIALIZED = 1  # presumably: constructed but not yet started
    STARTING = 2  # presumably: start-up in progress
    HEALTHY = 3  # presumably: running normally
    UNHEALTHY = 4  # presumably: running but failing health checks
class Runner:
def __init__(
self,
name: str,
event_queue: "Queue[Any]",
max_runs: int | None = None,
handle_kill: bool = True,
action_registry: dict[str, Callable[..., Any]] = {},
validator_registry: dict[str, WorkflowValidator] = {},
config: ClientConfig = ClientConfig(),
labels: dict[str, str | int] = {},
):
# We store the config so we can dynamically create clients for the dispatcher client.
self.config = config
self.client = new_client_raw(config)
self.name = self.client.config.namespace + name
self.max_runs = max_runs
self.tasks: dict[str, asyncio.Task[Any]] = {} # Store run ids and futures
self.contexts: dict[str, Context] = {} # Store run ids and contexts
self.action_registry: dict[str, Callable[..., Any]] = action_registry
self.validator_registry = validator_registry
self.event_queue = event_queue
# The thread pool is used for synchronous functions which need to run concurrently
self.thread_pool = ThreadPoolExecutor(max_workers=max_runs)
self.threads: Dict[str, Thread] = {} # Store run ids and threads
self.killing = False
self.handle_kill = handle_kill
# We need to initialize a new admin and dispatcher client *after* we've started the event loop,
# otherwise the grpc.aio methods will use a different event loop and we'll get a bunch of errors.
self.dispatcher_client = new_dispatcher(self.config)
self.admin_client = new_admin(self.config)
self.workflow_run_event_listener = new_listener(self.config)
self.client.workflow_listener = PooledWorkflowRunListener(self.config)
self.worker_context = WorkerContext(
labels=labels, client=new_client_raw(config).dispatcher
)
def create_workflow_run_url(self, action: Action) -> str:
return f"{self.config.server_url}/workflow-runs/{action.workflow_run_id}?tenant={action.tenant_id}"
def run(self, action: Action) -> None:
if self.worker_context.id() is None:
self.worker_context._worker_id = action.worker_id
match action.action_type:
case ActionType.START_STEP_RUN:
log = f"run: start step: {action.action_id}/{action.step_run_id}"
logger.info(log)
asyncio.create_task(self.handle_start_step_run(action))
case ActionType.CANCEL_STEP_RUN:
log = f"cancel: step run: {action.action_id}/{action.step_run_id}"
logger.info(log)
asyncio.create_task(self.handle_cancel_action(action.step_run_id))
case ActionType.START_GET_GROUP_KEY:
log = f"run: get group key: {action.action_id}/{action.get_group_key_run_id}"
logger.info(log)
asyncio.create_task(self.handle_start_group_key_run(action))
case _:
log = f"unknown action type: {action.action_type}"
logger.error(log)
def step_run_callback(self, action: Action) -> Callable[[asyncio.Task[Any]], None]:
def inner_callback(task: asyncio.Task[Any]) -> None:
self.cleanup_run_id(action.step_run_id)
errored = False
cancelled = task.cancelled()
# Get the output from the future
try:
if not cancelled:
output = task.result()
except Exception as e:
errored = True
# This except is coming from the application itself, so we want to send that to the Hatchet instance
self.event_queue.put(
ActionEvent(
action=action,
type=STEP_EVENT_TYPE_FAILED,
payload=str(errorWithTraceback(f"{e}", e)),
)
)
logger.error(
f"failed step run: {action.action_id}/{action.step_run_id}"
)
if not errored and not cancelled:
self.event_queue.put(
ActionEvent(
action=action,
type=STEP_EVENT_TYPE_COMPLETED,
payload=self.serialize_output(output),
)
)
logger.info(
f"finished step run: {action.action_id}/{action.step_run_id}"
)
return inner_callback
def group_key_run_callback(
self, action: Action
) -> Callable[[asyncio.Task[Any]], None]:
def inner_callback(task: asyncio.Task[Any]) -> None:
self.cleanup_run_id(action.get_group_key_run_id)
errored = False
cancelled = task.cancelled()
# Get the output from the future
try:
if not cancelled:
output = task.result()
except Exception as e:
errored = True
self.event_queue.put(
ActionEvent(
action=action,
type=GROUP_KEY_EVENT_TYPE_FAILED,
payload=str(errorWithTraceback(f"{e}", e)),
)
)
logger.error(
f"failed step run: {action.action_id}/{action.step_run_id}"
)
if not errored and not cancelled:
self.event_queue.put(
ActionEvent(
action=action,
type=GROUP_KEY_EVENT_TYPE_COMPLETED,
payload=self.serialize_output(output),
)
)
logger.info(
f"finished step run: {action.action_id}/{action.step_run_id}"
)
return inner_callback
## TODO: Stricter type hinting here
def thread_action_func(
self, context: Context, action_func: Callable[..., Any], action: Action
) -> Any:
if action.step_run_id is not None and action.step_run_id != "":
self.threads[action.step_run_id] = current_thread()
elif (
action.get_group_key_run_id is not None
and action.get_group_key_run_id != ""
):
self.threads[action.get_group_key_run_id] = current_thread()
return action_func(context)
## TODO: Stricter type hinting here
# We wrap all actions in an async func
async def async_wrapped_action_func(
self,
context: Context,
action_func: Callable[..., Any],
action: Action,
run_id: str,
) -> Any:
wr.set(context.workflow_run_id())
sr.set(context.step_run_id)
try:
if (
hasattr(action_func, "is_coroutine") and action_func.is_coroutine
) or asyncio.iscoroutinefunction(action_func):
return await action_func(context)
else:
pfunc = functools.partial(
# we must copy the context vars to the new thread, as only asyncio natively supports
# contextvars
copy_context_vars,
contextvars.copy_context().items(),
self.thread_action_func,
context,
action_func,
action,
)
loop = asyncio.get_event_loop()
return await loop.run_in_executor(self.thread_pool, pfunc)
except Exception as e:
logger.error(
errorWithTraceback(
f"exception raised in action ({action.action_id}, retry={action.retry_count}):\n{e}",
e,
)
)
raise e
finally:
self.cleanup_run_id(run_id)
def cleanup_run_id(self, run_id: str | None) -> None:
if run_id in self.tasks:
del self.tasks[run_id]
if run_id in self.threads:
del self.threads[run_id]
if run_id in self.contexts:
del self.contexts[run_id]
def create_context(
self, action: Action, action_func: Callable[..., Any] | None
) -> Context | DurableContext:
if hasattr(action_func, "durable") and getattr(action_func, "durable"):
return DurableContext(
action,
self.dispatcher_client,
self.admin_client,
self.client.event,
self.client.rest,
self.client.workflow_listener,
self.workflow_run_event_listener,
self.worker_context,
self.client.config.namespace,
validator_registry=self.validator_registry,
)
return Context(
action,
self.dispatcher_client,
self.admin_client,
self.client.event,
self.client.rest,
self.client.workflow_listener,
self.workflow_run_event_listener,
self.worker_context,
self.client.config.namespace,
validator_registry=self.validator_registry,
)
## IMPORTANT: Keep this method's signature in sync with the wrapper in the OTel instrumentor
async def handle_start_step_run(self, action: Action) -> None | Exception:
action_name = action.action_id
# Find the corresponding action function from the registry
action_func = self.action_registry.get(action_name)
context = self.create_context(action, action_func)
self.contexts[action.step_run_id] = context
if action_func:
self.event_queue.put(
ActionEvent(
action=action,
type=STEP_EVENT_TYPE_STARTED,
)
)
loop = asyncio.get_event_loop()
task = loop.create_task(
self.async_wrapped_action_func(
context, action_func, action, action.step_run_id
)
)
task.add_done_callback(self.step_run_callback(action))
self.tasks[action.step_run_id] = task
try:
await task
except Exception as e:
return e
return None
## IMPORTANT: Keep this method's signature in sync with the wrapper in the OTel instrumentor
async def handle_start_group_key_run(self, action: Action) -> Exception | None:
action_name = action.action_id
context = Context(
action,
self.dispatcher_client,
self.admin_client,
self.client.event,
self.client.rest,
self.client.workflow_listener,
self.workflow_run_event_listener,
self.worker_context,
self.client.config.namespace,
)
self.contexts[action.get_group_key_run_id] = context
# Find the corresponding action function from the registry
action_func = self.action_registry.get(action_name)
if action_func:
# send an event that the group key run has started
self.event_queue.put(
ActionEvent(
action=action,
type=GROUP_KEY_EVENT_TYPE_STARTED,
)
)
loop = asyncio.get_event_loop()
task = loop.create_task(
self.async_wrapped_action_func(
context, action_func, action, action.get_group_key_run_id
)
)
task.add_done_callback(self.group_key_run_callback(action))
self.tasks[action.get_group_key_run_id] = task
try:
await task
except Exception as e:
return e
return None
def force_kill_thread(self, thread: Thread) -> None:
"""Terminate a python threading.Thread."""
try:
if not thread.is_alive():
return
ident = cast(int, thread.ident)
logger.info(f"Forcefully terminating thread {ident}")
exc = ctypes.py_object(SystemExit)
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(ident), exc)
if res == 0:
raise ValueError("Invalid thread ID")
elif res != 1:
logger.error("PyThreadState_SetAsyncExc failed")
# Call with exception set to 0 is needed to cleanup properly.
ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
raise SystemError("PyThreadState_SetAsyncExc failed")
logger.info(f"Successfully terminated thread {ident}")
# Immediately add a new thread to the thread pool, because we've actually killed a worker
# in the ThreadPoolExecutor
self.thread_pool.submit(lambda: None)
except Exception as e:
logger.exception(f"Failed to terminate thread: {e}")
## IMPORTANT: Keep this method's signature in sync with the wrapper in the OTel instrumentor
async def handle_cancel_action(self, run_id: str) -> None:
try:
# call cancel to signal the context to stop
if run_id in self.contexts:
context = self.contexts.get(run_id)
if context:
context.cancel()
await asyncio.sleep(1)
if run_id in self.tasks:
future = self.tasks.get(run_id)
if future:
future.cancel()
# check if thread is still running, if so, print a warning
if run_id in self.threads:
thread = self.threads.get(run_id)
if thread and self.client.config.enable_force_kill_sync_threads:
self.force_kill_thread(thread)
await asyncio.sleep(1)
logger.warning(
f"Thread {self.threads[run_id].ident} with run id {run_id} is still running after cancellation. This could cause the thread pool to get blocked and prevent new tasks from running."
)
finally:
self.cleanup_run_id(run_id)
def serialize_output(self, output: Any) -> str:
if isinstance(output, BaseModel):
return output.model_dump_json()
if output is not None:
try:
return json.dumps(output)
except Exception as e:
logger.error(f"Could not serialize output: {e}")
return str(output)
return ""
async def wait_for_tasks(self) -> None:
running = len(self.tasks.keys())
while running > 0:
logger.info(f"waiting for {running} tasks to finish...")
await asyncio.sleep(1)
running = len(self.tasks.keys())
def errorWithTraceback(message: str, e: Exception) -> str:
    """Return ``message`` followed by a newline and the formatted traceback of ``e``.

    Args:
        message: Text placed on the first line(s) of the result.
        e: Exception whose type, value and ``__traceback__`` are rendered with
            ``traceback.format_exception``.

    Returns:
        ``message``, a newline, then the concatenated traceback lines.
    """
    formatted_lines = traceback.format_exception(type(e), e, e.__traceback__)
    return "{}\n{}".format(message, "".join(formatted_lines))