Advanced Usage¶
This guide covers advanced topics including custom parser steps, custom spec adapters, and extending anyschema for your specific use cases.
It may be helpful to review the Architecture page and work through the Getting Started guide before diving into advanced topics.
Custom Parser Steps¶
Creating custom parser steps allows you to add support for new type systems or handle special types in your own way.
Parser steps should inherit from the ParserStep base class and implement the parse method with the following signature, returning a narwhals dtype when the step handles the type and None to let the pipeline fall through to the next step:
import narwhals as nw
from anyschema.typing import FieldConstraints, FieldMetadata, FieldType
def parse(
    self,
    input_type: FieldType,
    constraints: FieldConstraints,
    metadata: FieldMetadata,
) -> nw.dtypes.DType | None:
    ...
Basic Custom Parser¶
Here's a simple custom parser for a hypothetical custom type:
import narwhals as nw

from anyschema.parsers import ParserPipeline, ParserStep, PyTypeStep
from anyschema.typing import FieldConstraints, FieldMetadata, FieldType


class Color:
    """A custom type representing a color."""


class ColorStep(ParserStep):
    """Parser for Color types."""

    def parse(
        self,
        input_type: FieldType,
        constraints: FieldConstraints,
        metadata: FieldMetadata,
    ) -> nw.dtypes.DType | None:
        """Parse Color to String dtype.

        Arguments:
            input_type: The type to parse.
            constraints: Constraints associated with the type.
            metadata: Custom metadata dictionary.

        Returns:
            String dtype for Color types, None otherwise.
        """
        if input_type is Color:
            return nw.String()
        return None


# Create a simple pipeline with the custom parser
color_step = ColorStep()
python_step = PyTypeStep()
pipeline = ParserPipeline(steps=[color_step, python_step])

result = pipeline.parse(Color, constraints=(), metadata={})
print(result)
Custom Parser with Nested Types¶
This example shows how to handle a custom generic type. Note how we use
self.pipeline.parse(..., constraints=constraints, metadata=metadata) for recursion, as
explained in the Architecture page:
from typing import Generic, TypeVar, get_args, get_origin

import narwhals as nw

from anyschema.parsers import ParserPipeline, ParserStep, PyTypeStep
from anyschema.typing import FieldConstraints, FieldMetadata, FieldType

T = TypeVar("T")


class MyList(Generic[T]):
    """A custom list-like type."""


class MyListStep(ParserStep):
    """Parser for MyList[T] generic types."""

    def parse(
        self,
        input_type: FieldType,
        constraints: FieldConstraints,
        metadata: FieldMetadata,
    ) -> nw.dtypes.DType | None:
        """Parse MyList[T] to List dtype.

        This parser handles custom generic types by recursively parsing
        the inner type through the pipeline.
        """
        origin = get_origin(input_type)
        if origin is MyList:
            # Get the inner type (e.g., T in MyList[T])
            args = get_args(input_type)
            if args:
                # Recursively parse the inner type
                inner_dtype = self.pipeline.parse(args[0], constraints=constraints, metadata=metadata)
                return nw.List(inner_dtype)
            # MyList without a type parameter
            return nw.List(nw.Object())
        return None


my_list_step = MyListStep()
python_step = PyTypeStep()
pipeline = ParserPipeline(steps=[my_list_step, python_step])

result = pipeline.parse(MyList[int], (), {})
print(result)
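Because MyListStep recurses through the same pipeline, nesting composes for free. Assuming the pipeline above, parsing a doubly nested type should yield a nested List dtype:

# MyList[MyList[int]] resolves the inner MyList[int] first (via recursion),
# then wraps it again, e.g. List(List(Int64)).
print(pipeline.parse(MyList[MyList[int]], (), {}))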
Custom Parser with Constraint Handling¶
This example shows how to use constraints, extracted from typing.Annotated by AnnotatedStep, to refine type parsing. For more on how constraints and metadata flow through the pipeline, see the Architecture section:
from typing import Annotated

import narwhals as nw

from anyschema.parsers import AnnotatedStep, ParserPipeline, ParserStep, PyTypeStep
from anyschema.typing import FieldConstraints, FieldMetadata, FieldType


class SmallInt:
    """Marker for small integers."""


class BigInt:
    """Marker for big integers."""


class CustomConstraintStep(ParserStep):
    """Parser that uses constraints to choose integer size."""

    def parse(
        self,
        input_type: FieldType,
        constraints: FieldConstraints,
        metadata: FieldMetadata,
    ) -> nw.dtypes.DType | None:
        """Parse integers with size constraints.

        Uses constraints to determine whether to use Int32 or Int64.
        """
        if input_type is int and constraints:
            for constraint in constraints:
                if constraint is SmallInt:
                    return nw.Int32()
                if constraint is BigInt:
                    return nw.Int64()
        return None


# Usage with typing.Annotated
SmallInteger = Annotated[int, SmallInt]
BigInteger = Annotated[int, BigInt]

# Create a pipeline with the custom parser
annotated_step = AnnotatedStep()
custom_constraint_step = CustomConstraintStep()
python_step = PyTypeStep()
pipeline = ParserPipeline(steps=[annotated_step, custom_constraint_step, python_step])

print(f"SmallInteger dtype: {pipeline.parse(SmallInteger, (), {})}")
print(f"BigInteger dtype: {pipeline.parse(BigInteger, (), {})}")
Combining Multiple Custom Parsers¶
Here's how to combine multiple custom parsers using the with_steps method for easy pipeline extension:
from anyschema import AnySchema
from anyschema.parsers import ParserPipeline

base_pipeline = ParserPipeline("auto")  # Start with the auto pipeline

# Add custom parsers using with_steps (automatically positions them optimally)
custom_pipeline = base_pipeline.with_steps(ColorStep(), MyListStep())

# Use the custom pipeline
schema = AnySchema(
    spec={"color": Color, "items": MyList[int]},
    pipeline=custom_pipeline,
)
print(schema.to_arrow())
The with_steps method makes it easy to extend existing pipelines without reconstructing them from scratch.
By default, it inserts custom steps right after the last preprocessing step found
(trying AnnotatedStep, UnionTypeStep, ForwardRefStep in that order),
ensuring they run after type preprocessing but before library-specific parsers.
You can also specify a position explicitly:
pipeline_at_start = base_pipeline.with_steps(ColorStep(), at_position=0)
pipeline_at_end = base_pipeline.with_steps(ColorStep(), at_position=-1)
print(pipeline_at_start.steps)
print(pipeline_at_end.steps)
(ColorStep, ForwardRefStep, UnionTypeStep, AnnotatedStep, AnnotatedTypesStep, AttrsTypeStep, PydanticTypeStep, SQLAlchemyTypeStep, PyTypeStep)
(ForwardRefStep, UnionTypeStep, AnnotatedStep, AnnotatedTypesStep, AttrsTypeStep, PydanticTypeStep, SQLAlchemyTypeStep, ColorStep, PyTypeStep)
Why use with_steps?
As the list of default steps grows, it becomes impractical to redefine the full list of steps just to add one or a few custom parsing steps. With pipeline.with_steps, you can simply extend an existing pipeline. This approach:
- Automatically includes all library-specific parsers based on installed dependencies.
- Positions your custom parsers after the preprocessing steps by default, or exactly where you specify via at_position.
Custom Spec Adapters¶
Custom adapters allow you to convert from any specification format to anyschema's internal format. Adapters need to follow the Adapter signature described in the API reference.
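In practice, an adapter is just a callable that takes your spec object and yields (field_name, field_type, constraints, metadata) tuples. A minimal sketch for a plain dict spec (illustrative only; see the API reference for the exact Adapter protocol):

from anyschema.typing import FieldSpecIterable


def dict_adapter(spec: dict[str, type]) -> FieldSpecIterable:
    # Each yielded tuple is (field_name, field_type, constraints, metadata).
    for name, type_ in spec.items():
        yield name, type_, (), {}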
Basic Custom Adapter¶
Here's a simple adapter for a custom schema format:
from typing import TypedDict

from anyschema import AnySchema
from anyschema.typing import FieldSpecIterable


class CustomFieldSpec(TypedDict):
    """Field specification in the custom schema format."""

    name: str
    type: type


class SimpleSchema:
    """A simple schema format."""

    def __init__(self, fields: list[CustomFieldSpec]) -> None:
        self.fields = fields


def simple_schema_adapter(spec: SimpleSchema) -> FieldSpecIterable:
    """Adapter for SimpleSchema format.

    Arguments:
        spec: A SimpleSchema instance.

    Yields:
        Tuples of (field_name, field_type, constraints, metadata).
    """
    for field in spec.fields:
        yield field["name"], field["type"], (), {}


schema_spec = SimpleSchema(
    fields=[
        {"name": "id", "type": int},
        {"name": "name", "type": str},
    ]
)

schema = AnySchema(spec=schema_spec, adapter=simple_schema_adapter)
print(schema.to_arrow())
Adapter with Constraint Conversion¶
This example shows how to convert a schema format's field constraints into anyschema's constraints tuple:
from anyschema import AnySchema
from anyschema.typing import FieldSpecIterable


class FieldWithConstraints:
    """A field with type and constraints."""

    def __init__(
        self,
        name: str,
        type_: type,
        min_val: int | None = None,
        max_val: int | None = None,
    ) -> None:
        self.name = name
        self.type = type_
        self.min_val = min_val
        self.max_val = max_val


class SchemaWithConstraints:
    """A schema format that includes constraints."""

    def __init__(self, fields: list[FieldWithConstraints]) -> None:
        self.fields = fields


def constrained_adapter(spec: SchemaWithConstraints) -> FieldSpecIterable:
    """Adapter that converts constraints to the constraints tuple.

    Arguments:
        spec: A SchemaWithConstraints instance.

    Yields:
        Tuples of (field_name, field_type, constraints, metadata).
    """
    for field in spec.fields:
        constraints = []
        if field.min_val is not None:
            constraints.append(("min", field.min_val))
        if field.max_val is not None:
            constraints.append(("max", field.max_val))
        yield field.name, field.type, tuple(constraints), {}


schema_spec = SchemaWithConstraints(
    fields=[
        FieldWithConstraints("age", int, min_val=0, max_val=120),
        FieldWithConstraints("name", str),
    ]
)

schema = AnySchema(spec=schema_spec, adapter=constrained_adapter)
print(schema.to_arrow())
Notice that no parser step consumes these ("min", ...) / ("max", ...) constraints in this example, so they do not affect the resulting dtypes. You would need to implement a step to act on them, as sketched below; see also the dedicated Custom Parser Steps section.
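A minimal sketch of such a step, assuming the constraint tuples produced by constrained_adapter above (BoundedIntStep is hypothetical, not part of anyschema):

import narwhals as nw

from anyschema.parsers import ParserStep
from anyschema.typing import FieldConstraints, FieldMetadata, FieldType


class BoundedIntStep(ParserStep):
    """Hypothetical step that picks an integer width from ("min", ...)/("max", ...) tuples."""

    def parse(
        self,
        input_type: FieldType,
        constraints: FieldConstraints,
        metadata: FieldMetadata,
    ) -> nw.dtypes.DType | None:
        if input_type is not int:
            return None
        # Collect ("min", value) / ("max", value) pairs into a dict
        bounds = dict(c for c in constraints if isinstance(c, tuple) and len(c) == 2)
        if bounds.get("min", -1) >= 0 and bounds.get("max", 2**32) <= 2**32 - 1:
            return nw.UInt32()
        return None  # Fall through to other steps (e.g. PyTypeStep)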
Adapter for Nested Structures¶
Handle nested schemas with a recursive adapter by dynamically creating TypedDict classes:
from typing import Any, TypedDict

from anyschema import AnySchema
from anyschema.typing import FieldSpecIterable


class NestedSchema:
    """A schema that can contain nested schemas."""

    def __init__(self, fields: dict[str, Any]) -> None:
        self.fields = fields


def nested_adapter(spec: NestedSchema) -> FieldSpecIterable:
    """Adapter for nested schema structures.

    For nested schemas, we dynamically create a TypedDict so the parser
    can properly extract the field structure.

    Arguments:
        spec: A NestedSchema instance.

    Yields:
        Tuples of (field_name, field_type, constraints, metadata).
    """
    for field_name, field_value in spec.fields.items():
        if isinstance(field_value, NestedSchema):
            # For nested schemas, create a TypedDict with the proper structure
            nested_dict = {name: type_ for name, type_, _, _ in nested_adapter(field_value)}
            # Create a dynamic TypedDict with the nested fields
            nested_typed_dict = TypedDict(
                f"{field_name.title()}TypedDict",  # Generate a unique name
                nested_dict,  # Field name -> type mapping
            )
            yield field_name, nested_typed_dict, (), {}
        else:
            yield field_name, field_value, (), {}


schema_spec = NestedSchema(
    fields={
        "id": int,
        "profile": NestedSchema(
            fields={
                "name": str,
                "age": int,
            }
        ),
    }
)

schema = AnySchema(spec=schema_spec, adapter=nested_adapter)
print(schema.to_arrow())
id: int64 not null
profile: struct<name: string, age: int64> not null
  child 0, name: string
  child 1, age: int64
Adapter for JSON Schema¶
Here's a practical example of adapting from JSON Schema:
import json

from anyschema import AnySchema
from anyschema.typing import FieldSpecIterable


def json_schema_adapter(spec: str) -> FieldSpecIterable:
    """Adapter for JSON Schema format.

    Arguments:
        spec: A JSON Schema with "type": "object" and "properties".

    Yields:
        Tuples of (field_name, field_type, constraints, metadata).
    """
    schema_dict = json.loads(spec)
    if schema_dict.get("type") != "object":
        raise ValueError("Only object types supported")

    properties = schema_dict.get("properties", {})
    required = set(schema_dict.get("required", []))

    type_mapping = {
        "string": str,
        "integer": int,
        "number": float,
        "boolean": bool,
        "array": list,
        "object": dict,
    }

    for field_name, field_spec in properties.items():
        json_type = field_spec.get("type")
        python_type = type_mapping.get(json_type, object)

        # Handle array item types first, so the optional union below
        # wraps the full list[...] type
        if json_type == "array" and "items" in field_spec:
            item_type = type_mapping.get(field_spec["items"].get("type"), object)
            python_type = list[item_type]

        # Non-required fields become optional (union with None)
        if field_name not in required:
            python_type = python_type | None

        yield field_name, python_type, (), {}


json_schema = json.dumps(
    {
        "type": "object",
        "properties": {
            "id": {"type": "integer"},
            "name": {"type": "string"},
            "tags": {"type": "array", "items": {"type": "string"}},
            "email": {"type": "string"},
        },
        "required": ["id", "name"],
    }
)

schema = AnySchema(spec=json_schema, adapter=json_schema_adapter)
print(schema.to_arrow())
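Since tags and email are absent from required, the adapter yields list[str] | None and str | None for them, so they should come out as nullable columns in the resulting Arrow schema, while id and name remain non-nullable.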