Best Practices¶
For Custom Parsers¶
- Return `None` when you can't handle a type, to let other parsers in the chain try.
- Use `self.pipeline.parse()` for recursion: delegating to the pipeline lets you handle nested types.
- Preserve metadata: Pass metadata through when recursively parsing.
- Order matters: Place specialized parsers before general ones.
- Document what types the parser can handle: Make it clear in docstrings.
from typing import Any, TypeVar, get_args, get_origin
import narwhals as nw
from anyschema.parsers import ParserStep
T = TypeVar("T")
# Placeholder type for the examples; GoodParserStep maps it to a String dtype.
class CustomType: ...
# Placeholder generic container for the examples; GoodParserStep maps
# CustomList[T] to List(<dtype parsed from T>).
class CustomList[T]: ...
class GoodParserStep(ParserStep):
    """Parser step for the example custom types.

    Supported inputs:
        - ``CustomType``: mapped to ``String``.
        - ``CustomList[T]``: mapped to ``List`` of whatever the pipeline
          resolves ``T`` to.

    Any other input yields ``None`` so that other parsers in the chain can try.
    """

    def parse(self, input_type: Any, metadata: tuple = ()) -> nw.dtypes.DType | None:
        """Convert ``input_type`` to a narwhals dtype when it is a supported type.

        Arguments:
            input_type: The type annotation to convert.
            metadata: Metadata attached to the type; forwarded unchanged when
                the element type of a ``CustomList`` is parsed.

        Returns:
            The matching narwhals dtype, or ``None`` if this step does not
            handle ``input_type``.
        """
        # Plain (non-generic) custom type.
        if input_type is CustomType:
            return nw.String()

        # Everything that is not a parametrized CustomList is out of scope.
        if get_origin(input_type) is not CustomList:
            return None

        # Recurse through the pipeline so the element type may be any type
        # that some step in the pipeline understands.
        element_type = get_args(input_type)[0]
        return nw.List(self.pipeline.parse(element_type, metadata=metadata))
For Custom Adapters¶
- Use generators: Yield instead of returning a list for memory efficiency.
- Handle nested structures: Recursively convert nested schemas.
- Validate input: Check that the spec is the expected format.
- Convert metadata consistently: Have a clear mapping from your format to anyschema metadata.
- Document the expected input format: Make it clear what spec format you accept.
from typing import Any, TypedDict
from anyschema.typing import FieldSpecIterable
class CustomSchemaSpec:
    """Minimal schema description: a mapping of field names to field types.

    A value in ``fields`` may itself be a ``CustomSchemaSpec`` to describe a
    nested structure.
    """

    def __init__(self, fields: dict[str, Any]) -> None:
        """Store ``fields`` on the instance as-is (no copy is made)."""
        self.fields = fields
def good_adapter(spec: CustomSchemaSpec) -> FieldSpecIterable:
    """Adapt a ``CustomSchemaSpec`` into a stream of field specifications.

    A field whose value is itself a ``CustomSchemaSpec`` is adapted
    recursively and exposed as a dynamically created ``TypedDict``, so that
    the parser can extract the nested field structure.

    Arguments:
        spec: A CustomSchemaSpec instance.

    Yields:
        Tuples of (field_name, field_type, metadata). The metadata element is
        always an empty tuple.

    Raises:
        TypeError: If spec is not a CustomSchemaSpec instance. Because this is
            a generator, the check runs when iteration starts.
    """
    if not isinstance(spec, CustomSchemaSpec):
        raise TypeError(f"Expected `CustomSchemaSpec`, got {type(spec)}")

    for field_name, field_value in spec.fields.items():
        # Leaf field: pass the declared type straight through.
        if not isinstance(field_value, CustomSchemaSpec):
            yield field_name, field_value, ()
            continue

        # Nested schema: collect its (name -> type) pairs recursively; the
        # nested fields' metadata is not carried into the TypedDict.
        nested_fields = {}
        for nested_name, nested_type, _ in good_adapter(field_value):
            nested_fields[nested_name] = nested_type

        # Functional TypedDict syntax, named after the field that holds it.
        yield field_name, TypedDict(f"{field_name.title()}TypedDict", nested_fields), ()
Integration Testing¶
Test your custom components thoroughly at multiple levels: unit tests for individual parsers and adapters, and integration tests for the complete flow.
import polars as pl
import pytest
from anyschema import AnySchema
from anyschema.parsers import make_pipeline, ParserPipeline, PyTypeStep
@pytest.fixture(scope="module")
def custom_step() -> GoodParserStep:
    """Provide a ``GoodParserStep`` that has been placed in a pipeline.

    The pipeline object itself is discarded; it is built only so the step can
    recurse through ``self.pipeline``.
    NOTE(review): presumably ``make_pipeline`` attaches the pipeline to each
    step it receives -- confirm against the anyschema API.
    """
    step = GoodParserStep()
    fallback = PyTypeStep()
    make_pipeline([step, fallback])
    return step
@pytest.mark.parametrize(
    ("input_type", "expected_dtype"),
    [
        (CustomType, nw.String()),
        (CustomList[int], nw.List(nw.Int64())),
        (str, None),
    ],
)
def test_custom_step_parse(
    custom_step: GoodParserStep,
    input_type: Any,
    expected_dtype: nw.dtypes.DType | None,
) -> None:
    """Test that custom parser handles its types correctly.

    Arguments:
        custom_step: The step under test, already attached to a pipeline.
        input_type: The type handed to ``parse``.
        expected_dtype: The dtype ``parse`` should return, or ``None`` when
            the step is expected to decline the type.
    """
    # Parse the parametrized input; a hard-coded type would make every case
    # exercise the same code path.
    result = custom_step.parse(input_type)
    assert result == expected_dtype
def test_custom_adapter() -> None:
    """Check that a flat spec yields one (name, type, metadata) triple per field."""
    fields = {"id": int, "name": str}

    result = list(good_adapter(CustomSchemaSpec(fields)))

    assert len(result) == len(fields)
    # Fields come out in declaration order, each with empty metadata.
    assert result == [("id", int, ()), ("name", str, ())]
def test_custom_adapter_nested() -> None:
    """Check that a nested spec is surfaced as a TypedDict-like field type."""
    profile_spec = CustomSchemaSpec(fields={"name": str, "age": int})
    fields = {"id": int, "profile": profile_spec}

    result = list(good_adapter(CustomSchemaSpec(fields=fields)))

    assert len(result) == len(fields)
    assert result[0] == ("id", int, ())

    # The nested entry keeps its name, and its type carries annotations,
    # as a TypedDict does.
    profile_entry = result[1]
    assert profile_entry[0] == "profile"
    assert hasattr(profile_entry[1], "__annotations__")
def test_custom_components_integration() -> None:
    """Test custom parser and adapter working together end-to-end.

    Builds an ``AnySchema`` from a ``CustomSchemaSpec`` using the custom
    adapter and parser step, then checks the Arrow field names and the full
    Polars schema.
    """
    schema_spec = CustomSchemaSpec(
        fields={
            "custom_field": CustomType,
            "custom_list": CustomList[int],
            "name": str,
        }
    )
    schema = AnySchema(
        spec=schema_spec,
        # Specialized step first, general Python-type step second.
        steps=[GoodParserStep(), PyTypeStep()],
        adapter=good_adapter,
    )

    # Verify the conversion to Arrow works correctly
    schema_pa = schema.to_arrow()
    assert schema_pa.names == ["custom_field", "custom_list", "name"]

    # Verify types are converted correctly
    schema_pl = schema.to_polars()
    expected_pl = pl.Schema(
        {
            "custom_field": pl.String(),
            "custom_list": pl.List(pl.Int64()),
            "name": pl.String(),
        },
    )
    assert schema_pl == expected_pl