Python API Examples
Example 1: Document Chunking with Context
Split long documents into chunks, add surrounding context to each chunk, extract structured information from every chunk, then combine the per-chunk results into one analysis per paper:
import docetl
# Set the default model on the docetl module before building the pipeline.
docetl.default_model = "gpt-4o-mini"

# Prompt template and output schema for the per-chunk map step.
section_prompt = """Analyze this paper section with its surrounding context:
Paper: {{ input.title }}
Section: {{ input.full_text_chunk_rendered }}
Extract the section type, key findings, and technical concepts."""
section_schema = {
    "section_type": "str",
    "key_findings": "list[str]",
    "technical_concepts": "list[str]",
}

# Prompt template and output schema for the reduce step keyed on "paper_id".
paper_prompt = """Create a comprehensive analysis of this paper:
{% for section in inputs %}
{{ section.section_type }}: {{ section.key_findings | join(", ") }}
{% endfor %}"""
paper_schema = {
    "summary": "str",
    "main_contributions": "list[str]",
}

# Stage 1: load the papers and split "full_text" with the "\n\n" delimiter.
papers = docetl.read_json("papers.json")
chunks = papers.split(
    split_key="full_text",
    method="delimiter",
    method_kwargs={"delimiter": "\n\n", "num_splits_to_group": 2},
)

# Stage 2: attach peripheral chunks to every chunk.
# NOTE(review): the "previous" side uses "head"; confirm that "head" (rather
# than "tail") selects the intended window of earlier chunks.
context_window = {
    "previous": {"head": {"count": 1}},
    "next": {"head": {"count": 1}},
}
chunks_with_context = chunks.gather(
    content_key="full_text_chunk",
    doc_id_key="split_0_id",
    order_key="split_0_chunk_num",
    peripheral_chunks=context_window,
)

# Stage 3: run the section prompt over each chunk.
sections = chunks_with_context.map(
    prompt=section_prompt,
    output={"schema": section_schema},
)

# Stage 4: group the sections by "paper_id" and run the paper prompt per group.
paper_analyses = sections.reduce(
    reduce_key="paper_id",
    prompt=paper_prompt,
    output={"schema": paper_schema},
)

# Collect the pipeline result and show it.
df = paper_analyses.collect()
print(df)
Example 2: Fuzzy Aggregation with the Pandas Accessor
The pandas .semantic accessor runs operations on existing DataFrames:
import pandas as pd
import docetl
# Set the default model on the docetl module before calling the accessor.
docetl.default_model = "gpt-4o-mini"

# Build the input frame: one row per post, with "text" and "timestamp" columns.
post_texts = [
    "Just tried the new iPhone 15!",
    "Having issues with iOS 17",
    "Android is way better",
]
post_dates = ["2024-01-01", "2024-01-02", "2024-01-03"]
posts = pd.DataFrame({"text": post_texts, "timestamp": post_dates})

# Map step: request "product" and "sentiment" fields for each post.
extraction_prompt = "Extract product and sentiment from: {{ input.text }}"
extraction_schema = {"product": "str", "sentiment": "str"}
analyzed = posts.semantic.map(
    prompt=extraction_prompt,
    output={"schema": extraction_schema},
)

# Filter step: apply the Apple-products question to each analyzed row.
apple_prompt = "Is this about Apple products? {{ input }}"
relevant = analyzed.semantic.filter(prompt=apple_prompt)

# Aggregation step: fuzzy grouping on "product", then one summary per group.
summary_schema = {"summary": "str", "frequency": "int"}
summaries = relevant.semantic.agg(
    fuzzy=True,
    reduce_keys=["product"],
    comparison_prompt="Do these posts discuss the same product?",
    reduce_prompt="Summarize the feedback about this product",
    output={"schema": summary_schema},
)

# Report the accessor's total cost to four decimal places, then the result.
total_cost = summaries.semantic.total_cost
print(f"Cost: ${total_cost:.4f}")
print(summaries)
Datasets can be JSON, CSV, or Parquet.