Skip to content

Advanced Usage

This guide covers advanced topics including custom parser steps, custom spec adapters, and extending anyschema for your specific use cases.

Before diving into advanced topics, it helps to review the Architecture page and work through the Getting Started guide.

Custom Parser Steps

Creating custom parser steps allows you to add support for new type systems or handle special types in your own way. Parser steps should inherit from the ParserStep base class and implement the parse method with the following signature:

def parse(self, input_type: Any, metadata: tuple = ()) -> DType | None:
    """Return the dtype for ``input_type``, or ``None`` when this step does not handle it."""
    ...

Basic Custom Parser

Here's a simple custom parser for a hypothetical custom type:

from typing import Any

import narwhals as nw
from anyschema.parsers import make_pipeline, ParserStep, PyTypeStep


class Color:
    """Custom marker type standing in for a color value."""


class ColorStep(ParserStep):
    """Parser step that recognises the ``Color`` type."""

    def parse(self, input_type: Any, metadata: tuple = ()) -> nw.dtypes.DType | None:
        """Map ``Color`` to the narwhals String dtype.

        Arguments:
            input_type: The type being parsed.
            metadata: Metadata attached to the type; not used by this step.

        Returns:
            ``nw.String()`` when ``input_type`` is exactly ``Color``,
            otherwise ``None``.
        """
        # Identity check: only the Color class itself is matched.
        return nw.String() if input_type is Color else None


# Create a simple pipeline with the custom parser.
# ColorStep is listed ahead of PyTypeStep in the steps list.
color_step = ColorStep()
python_step = PyTypeStep()
pipeline = make_pipeline(steps=[color_step, python_step])

# Parse the custom type and print the resulting dtype.
result = pipeline.parse(Color)
print(result)
String

Custom Parser with Nested Types

This example shows how to handle a custom generic type. Note how we use self.pipeline.parse() for recursion, as explained in the Architecture page:

from typing import Any, TypeVar, get_args, get_origin

import narwhals as nw
from anyschema.parsers import ParserStep

T = TypeVar("T")

class MyList[T]:
    """A custom list-like type."""

    pass


class MyListStep(ParserStep):
    """Parser for ``MyList`` and ``MyList[T]`` generic types."""

    def parse(self, input_type: Any, metadata: tuple = ()) -> nw.dtypes.DType | None:
        """Parse ``MyList[T]`` to a List dtype.

        The inner type is parsed recursively through ``self.pipeline`` so that
        any step in the pipeline can resolve it.

        Arguments:
            input_type: The type to parse.
            metadata: Metadata associated with the type; forwarded unchanged
                when parsing the inner type.

        Returns:
            ``nw.List(<inner dtype>)`` for ``MyList[T]``,
            ``nw.List(nw.Object())`` for a bare ``MyList``, and ``None`` for
            any other type.
        """
        # A bare ``MyList`` has no origin (``get_origin`` returns None for a
        # plain class), so it must be matched by identity; checking the origin
        # alone would never reach the "no type parameter" case.
        if input_type is MyList:
            return nw.List(nw.Object())

        if get_origin(input_type) is not MyList:
            return None

        args = get_args(input_type)
        if not args:
            # Defensive: a parameterised alias that carries no arguments.
            return nw.List(nw.Object())

        # Recursively parse the inner type (e.g. T in MyList[T]).
        inner_dtype = self.pipeline.parse(args[0], metadata=metadata)
        return nw.List(inner_dtype)

# make_pipeline and PyTypeStep are not part of this example's import block,
# so import them here to keep the snippet runnable on its own.
from anyschema.parsers import PyTypeStep, make_pipeline

my_list_step = MyListStep()
python_step = PyTypeStep()
pipeline = make_pipeline(steps=[my_list_step, python_step])
result = pipeline.parse(MyList[int])
print(result)
List(Int64)

Custom Parser with Metadata Handling

This example shows how to use metadata to refine type parsing. For more on metadata flow, see the Architecture section:

from typing import Any, Annotated

import narwhals as nw
from anyschema.parsers import AnnotatedStep, ParserStep, PyTypeStep


class SmallInt:
    """Marker class used as annotation metadata to request a small integer."""


class BigInt:
    """Marker class used as annotation metadata to request a big integer."""


class CustomConstraintStep(ParserStep):
    """Parser step that picks an integer width from marker metadata."""

    def parse(self, input_type: Any, metadata: tuple = ()) -> nw.dtypes.DType | None:
        """Resolve ``int`` to Int32 or Int64 based on a size marker.

        Arguments:
            input_type: The type to parse.
            metadata: Metadata items; scanned in order for ``SmallInt`` or
                ``BigInt``.

        Returns:
            ``nw.Int32()`` if ``SmallInt`` is found first, ``nw.Int64()`` if
            ``BigInt`` is found first, and ``None`` when the type is not
            ``int``, the metadata is empty, or no marker is present.
        """
        # Only plain ``int`` with non-empty metadata is of interest here.
        if input_type is not int or not metadata:
            return None

        # The first marker encountered decides the result.
        for marker in metadata:
            if marker is SmallInt:
                return nw.Int32()
            if marker is BigInt:
                return nw.Int64()

        return None


# make_pipeline is not part of this example's import block, so import it here
# to keep the snippet runnable on its own.
from anyschema.parsers import make_pipeline

# Usage with typing.Annotated
SmallInteger = Annotated[int, SmallInt]
BigInteger = Annotated[int, BigInt]

# Create a pipeline with the custom parser
annotated_step = AnnotatedStep()
custom_constraint_step = CustomConstraintStep()
python_step = PyTypeStep()
pipeline = make_pipeline(steps=[annotated_step, custom_constraint_step, python_step])

print(f"SmallInteger dtype: {pipeline.parse(SmallInteger)}")
print(f"BigInteger dtype: {pipeline.parse(BigInteger)}")
SmallInteger dtype: Int32
BigInteger dtype: Int64

Combining Multiple Custom Parsers

Here's how to combine multiple custom parsers:

from anyschema import AnySchema
from anyschema.parsers import (
    make_pipeline,
    ParserPipeline,
    ForwardRefStep,
    UnionTypeStep,
    AnnotatedStep,
    PyTypeStep,
)

# Create pipeline with custom parsers.
# ColorStep, MyListStep, Color and MyList are the classes defined in the
# earlier examples on this page.
custom_pipeline = make_pipeline(
    steps=[
        ForwardRefStep(),
        UnionTypeStep(),
        AnnotatedStep(),
        ColorStep(),  # Our custom parsers
        MyListStep(),  # before the fallback
        PyTypeStep(),  # Fallback
    ]
)

# Use the custom pipeline: hand its steps to AnySchema together with a
# mapping of field name -> type.
schema = AnySchema(
    spec={"color": Color, "items": MyList[int]},
    steps=custom_pipeline.steps,
)
print(schema.to_arrow())
color: string
items: list<item: int64>
  child 0, item: int64

Custom Spec Adapters

Custom adapters allow you to convert from any specification format to anyschema's internal format. Adapters need to follow the Adapter signature described in the API reference.

Basic Custom Adapter

Here's a simple adapter for a custom schema format:

from typing import TypedDict
from anyschema import AnySchema
from anyschema.typing import FieldSpecIterable


class CustomFieldSpec(TypedDict):
    """Field specification in the custom schema format.

    Each entry is a dict with a ``"name"`` key and a ``"type"`` key.
    """

    # Name of the field.
    name: str
    # Python type of the field.
    type: type


class SimpleSchema:
    """A simple schema format.

    Holds a list of field specifications on the ``fields`` attribute.
    """

    def __init__(self, fields: list[CustomFieldSpec]) -> None:
        """Store the given field specifications as-is."""
        self.fields = fields


def simple_schema_adapter(spec: SimpleSchema) -> FieldSpecIterable:
    """Translate a SimpleSchema into field specifications.

    Arguments:
        spec: Object whose ``fields`` attribute is a list of dicts with
            ``"name"`` and ``"type"`` keys.

    Yields:
        ``(field_name, field_type, metadata)`` tuples, one per field, with an
        empty metadata tuple.
    """
    no_metadata = ()
    yield from (
        (entry["name"], entry["type"], no_metadata) for entry in spec.fields
    )


# Describe two fields in the custom format.
schema_spec = SimpleSchema(
    fields=[
        {"name": "id", "type": int},
        {"name": "name", "type": str},
    ]
)

# Pass the adapter alongside the spec so AnySchema can read the custom format.
schema = AnySchema(spec=schema_spec, adapter=simple_schema_adapter)
print(schema.to_arrow())
id: int64
name: string

Adapter with Metadata Conversion

This example shows how to convert schema metadata to anyschema metadata:

from typing import Annotated
from anyschema import AnySchema
from anyschema.typing import FieldSpecIterable


class FieldWithConstraints:
    """A field with type and constraints.

    Stores a field name, its type, and optional minimum/maximum values.
    Both bounds default to ``None``.
    """

    def __init__(
        self,
        name: str,
        type_: type,
        min_val: int | None = None,
        max_val: int | None = None,
    ) -> None:
        """Store the field name, type and optional bounds on the instance."""
        self.name = name
        # The parameter is ``type_``; the attribute is exposed as ``type``.
        self.type = type_
        self.min_val = min_val
        self.max_val = max_val


class SchemaWithConstraints:
    """A schema format that includes constraints.

    Holds a list of ``FieldWithConstraints`` on the ``fields`` attribute.
    """

    def __init__(self, fields: list[FieldWithConstraints]) -> None:
        """Store the given fields as-is."""
        self.fields = fields


def constrained_adapter(spec: SchemaWithConstraints) -> FieldSpecIterable:
    """Turn per-field min/max constraints into metadata tuples.

    Arguments:
        spec: Object whose ``fields`` attribute is a list of items exposing
            ``name``, ``type``, ``min_val`` and ``max_val``.

    Yields:
        ``(field_name, field_type, metadata)`` tuples. ``metadata`` holds a
        ``("min", value)`` and/or ``("max", value)`` pair for every bound that
        is not ``None``, in that order.
    """
    for field in spec.fields:
        bounds = (("min", field.min_val), ("max", field.max_val))
        # Keep only the bounds that were provided; 0 is a valid bound, so the
        # test is against None rather than truthiness.
        constraints = tuple(pair for pair in bounds if pair[1] is not None)
        yield field.name, field.type, constraints


# "age" carries both bounds; "name" has no constraints.
schema_spec = SchemaWithConstraints(
    fields=[
        FieldWithConstraints("age", int, min_val=0, max_val=120),
        FieldWithConstraints("name", str),
    ]
)

# Pass the adapter alongside the spec so AnySchema can read the custom format.
schema = AnySchema(spec=schema_spec, adapter=constrained_adapter)
print(schema.to_arrow())
age: int64
name: string

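Notice that this example has no parser step that handles the ("min", ...) and ("max", ...) metadata, which is why age is still parsed as a plain int64. To act on custom metadata you need to implement a parser step that reads it; see "Custom Parser with Metadata Handling" in the Custom Parser Steps section for an example.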

Adapter for Nested Structures

Handle nested schemas with a recursive adapter by dynamically creating TypedDict classes:

from typing import Any, TypedDict
from anyschema import AnySchema
from anyschema.typing import FieldSpecIterable


class NestedSchema:
    """A schema that can contain nested schemas.

    ``fields`` maps a field name to its value; in the example usage a value is
    either a Python type or another ``NestedSchema``.
    """

    def __init__(self, fields: dict[str, Any]) -> None:
        """Store the given field mapping as-is."""
        self.fields = fields


def nested_adapter(spec: NestedSchema) -> FieldSpecIterable:
    """Adapt a possibly nested schema into field specifications.

    A field whose value is itself a ``NestedSchema`` is converted into a
    dynamically created ``TypedDict`` built from the nested fields; every
    other field is passed through unchanged.

    Arguments:
        spec: A NestedSchema instance.

    Yields:
        ``(field_name, field_type, metadata)`` tuples with empty metadata.
    """
    for field_name, field_value in spec.fields.items():
        if not isinstance(field_value, NestedSchema):
            # Leaf field: pass the value through as the field type.
            yield field_name, field_value, ()
            continue

        # Recurse to collect the nested field name -> type mapping; metadata
        # produced for the nested fields is not carried over.
        inner_fields = {}
        for inner_name, inner_type, _ in nested_adapter(field_value):
            inner_fields[inner_name] = inner_type

        # The TypedDict name is derived from the field name.
        struct_name = f"{field_name.title()}TypedDict"
        struct_type = TypedDict(struct_name, inner_fields)
        yield field_name, struct_type, ()


# "profile" is itself a NestedSchema, so the adapter turns it into a TypedDict.
schema_spec = NestedSchema(
    fields={
        "id": int,
        "profile": NestedSchema(
            fields={
                "name": str,
                "age": int,
            }
        ),
    }
)
# Pass the adapter alongside the spec so AnySchema can read the custom format.
schema = AnySchema(spec=schema_spec, adapter=nested_adapter)
print(schema.to_arrow())
id: int64
profile: struct<name: string, age: int64>
  child 0, name: string
  child 1, age: int64

Adapter for JSON Schema

Here's a practical example of adapting from JSON Schema:

import json
from anyschema import AnySchema
from anyschema.typing import FieldSpecIterable


def json_schema_adapter(spec: str) -> FieldSpecIterable:
    """Adapter for JSON Schema format.

    Arguments:
        spec: A JSON Schema with "type": "object" and "properties".

    Yields:
        Tuples of (field_name, field_type, metadata).
    """
    spec = json.loads(spec)
    if spec.get("type") != "object":
        raise ValueError("Only object types supported")

    properties = spec.get("properties", {})
    required = set(spec.get("required", []))

    type_mapping = {
        "string": str,
        "integer": int,
        "number": float,
        "boolean": bool,
        "array": list,
        "object": dict,
    }

    for field_name, field_spec in properties.items():
        json_type = field_spec.get("type")
        python_type = type_mapping.get(json_type, object)

        # Handle optional fields
        if field_name not in required:
            python_type = python_type | None

        # Handle array types
        if json_type == "array" and "items" in field_spec:
            item_type = type_mapping.get(field_spec["items"].get("type"), object)
            python_type = list[item_type]

        yield field_name, python_type, ()


# Build the JSON Schema document as a string; only "id" and "name" are
# listed as required.
json_schema = json.dumps(
    {
        "type": "object",
        "properties": {
            "id": {"type": "integer"},
            "name": {"type": "string"},
            "tags": {"type": "array", "items": {"type": "string"}},
            "email": {"type": "string"},
        },
        "required": ["id", "name"],
    }
)

# Pass the adapter alongside the spec so AnySchema can read the JSON string.
schema = AnySchema(spec=json_schema, adapter=json_schema_adapter)
print(schema.to_arrow())
id: int64
name: string
tags: list<item: string>
  child 0, item: string
email: string