Workflows
Recipes for the three most common Document Intelligence use cases.
RAG ingestion pipeline
Parse a document, embed the sanitized chunks, and store them in a vector database. Use safe-parse so no PII lands in your vector index.
Python
import time
import httpx
# Example key value for this recipe - replace it with your own API key.
API_KEY = "pk_live_abc123"
BASE_URL = "https://api.expunct.ai"
# Header dict passed to every httpx call below; carries the key as X-API-Key.
headers = {"X-API-Key": API_KEY}
# 1. Submit safe-parse job
with open("legal_brief.pdf", "rb") as f:
    r = httpx.post(
        f"{BASE_URL}/api/v1/workflows/safe-parse",
        headers=headers,
        files={"file": ("legal_brief.pdf", f, "application/pdf")},
        data={"language": "en"},
    )
# Fail fast on 4xx/5xx instead of hitting a KeyError on an error payload.
r.raise_for_status()
job_id = r.json()["id"]

# 2. Poll until done
while True:
    r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
    r.raise_for_status()
    job = r.json()
    if job["status"] == "completed":
        break
    if job["status"] == "failed":
        raise RuntimeError(job["error_message"])
    time.sleep(2)

# 3. Find the sanitized chunks artifact
chunks_artifact = next(
    (a for a in job["artifacts"] if a["artifact_kind"] == "sanitized_chunks_v1"),
    None,
)
if chunks_artifact is None:
    # A bare StopIteration from next() would give no hint about what is missing.
    raise RuntimeError("job completed without a sanitized_chunks_v1 artifact")

# 4. Fetch the chunk content
r = httpx.get(
    f"{BASE_URL}/api/v1/documents/{chunks_artifact['id']}/content",
    headers=headers,
)
r.raise_for_status()
chunk_set = r.json()
# 5. Embed and store
for chunk in chunk_set["chunks"]:
embedding = embed(chunk["text"]) # your embedding function
vector_db.upsert(chunk["chunk_id"], embedding, metadata={
"text": chunk["text"],
"page": chunk["page_number"],
"source": "legal_brief.pdf",
})Compliance pipeline
Run safe-parse on incoming documents to produce an audit-ready sanitized copy, then verify redaction coverage before archiving.
Python
import time
import httpx
# Example key value for this recipe - replace it with your own API key.
API_KEY = "pk_live_abc123"
BASE_URL = "https://api.expunct.ai"
# Header dict passed to every httpx call below; carries the key as X-API-Key.
headers = {"X-API-Key": API_KEY}
def safe_parse_and_verify(file_path: str, policy_id: str | None = None) -> dict:
    """Run safe-parse on a document and report redaction-label coverage.

    Uploads ``file_path`` to the safe-parse workflow, polls the job every two
    seconds until it finishes, then fetches the sanitized canonical document
    and prints how many of its blocks contain bracketed text.

    Args:
        file_path: Path of the PDF to upload.
        policy_id: Optional policy ID; left out of the request when falsy.

    Returns:
        A dict mapping every artifact kind produced by the job to its
        artifact ID (not only the sanitized ones).

    Raises:
        httpx.HTTPStatusError: If any request returns a 4xx/5xx response.
        RuntimeError: If the job finishes with status ``"failed"``.
        KeyError: If the job has no ``sanitized_canonical_document`` artifact.
    """
    from pathlib import Path  # local import: only needed for the upload name

    data = {"language": "en"}
    if policy_id:
        data["policy_id"] = policy_id

    with open(file_path, "rb") as f:
        r = httpx.post(
            f"{BASE_URL}/api/v1/workflows/safe-parse",
            headers=headers,
            # Send only the base name so local directory names are not
            # included in the multipart filename.
            files={"file": (Path(file_path).name, f, "application/pdf")},
            data=data,
        )
    r.raise_for_status()
    job_id = r.json()["id"]

    # Poll until done
    while True:
        r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
        # Surface HTTP errors directly instead of a KeyError on "status".
        r.raise_for_status()
        job = r.json()
        if job["status"] == "completed":
            break
        if job["status"] == "failed":
            raise RuntimeError(f"safe-parse failed: {job['error_message']}")
        time.sleep(2)

    artifacts = {a["artifact_kind"]: a["id"] for a in job["artifacts"]}

    # Fetch the sanitized canonical doc and count blocks carrying redaction labels
    r = httpx.get(
        f"{BASE_URL}/api/v1/documents/{artifacts['sanitized_canonical_document']}/content",
        headers=headers,
    )
    r.raise_for_status()
    doc = r.json()

    blocks = [block for page in doc["pages"] for block in page["blocks"]]
    total_blocks = len(blocks)
    # Heuristic: any block containing both "[" and "]" is counted, so bracketed
    # text that was already in the source document is counted as well.
    redacted_blocks = sum(
        1 for block in blocks if "[" in block["text"] and "]" in block["text"]
    )
    print(f"Redaction coverage: {redacted_blocks}/{total_blocks} blocks contain redaction labels")
    return artifacts
# Example
artifacts = safe_parse_and_verify("patient_intake.pdf", policy_id="pol_hipaa_strict")
print(f"Sanitized doc: {artifacts['sanitized_canonical_document']}")
print(f"Sanitized markdown: {artifacts['sanitized_markdown_render']}")Invoice extraction
Extract structured fields from a batch of invoices using the built-in invoice template. Parse once, extract fields, then post-process the results.
Python
import time
import httpx
# Example key value for this recipe - replace it with your own API key.
API_KEY = "pk_live_abc123"
BASE_URL = "https://api.expunct.ai"
# Header dict passed to every httpx call below; carries the key as X-API-Key.
headers = {"X-API-Key": API_KEY}
def extract_invoice(file_path: str) -> dict:
    """Extract invoice fields from a PDF using the ``invoice`` template.

    Uploads ``file_path`` to the extract endpoint, polls the job every two
    seconds until it finishes, and returns the JSON content of the job's
    ``extract_result`` artifact.

    Args:
        file_path: Path of the invoice PDF to upload.

    Returns:
        The decoded JSON content of the ``extract_result`` artifact.

    Raises:
        httpx.HTTPStatusError: If any request returns a 4xx/5xx response.
        RuntimeError: If the job fails, or completes without an
            ``extract_result`` artifact.
    """
    from pathlib import Path  # local import: only needed for the upload name

    with open(file_path, "rb") as f:
        r = httpx.post(
            f"{BASE_URL}/api/v1/extract",
            headers=headers,
            # Send only the base name so local directory names are not
            # included in the multipart filename.
            files={"file": (Path(file_path).name, f, "application/pdf")},
            data={"template_id": "invoice"},
        )
    r.raise_for_status()
    job_id = r.json()["id"]

    while True:
        r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
        # Surface HTTP errors directly instead of a KeyError on "status".
        r.raise_for_status()
        job = r.json()
        if job["status"] == "completed":
            break
        if job["status"] == "failed":
            raise RuntimeError(job["error_message"])
        time.sleep(2)

    result_artifact = next(
        (a for a in job["artifacts"] if a["artifact_kind"] == "extract_result"),
        None,
    )
    if result_artifact is None:
        # A bare StopIteration from next() would give no hint about what is missing.
        raise RuntimeError("job completed without an extract_result artifact")

    r = httpx.get(
        f"{BASE_URL}/api/v1/documents/{result_artifact['id']}/content",
        headers=headers,
    )
    r.raise_for_status()
    return r.json()
# Process a batch
invoices = ["inv_001.pdf", "inv_002.pdf", "inv_003.pdf"]
for invoice_file in invoices:
result = extract_invoice(invoice_file)
# Build a dict of high-confidence field values
fields = {
f["field_name"]: f["value"]
for f in result["fields"]
if f["confidence"] >= 0.7 # discard low-confidence extractions
}
print(f"{invoice_file}:")
print(f" Invoice #: {fields.get('invoice_number', 'N/A')}")
print(f" Total: {fields.get('total_amount', 'N/A')}")
print(f" Vendor: {fields.get('vendor_name', 'N/A')}")
if result["validation_errors"]:
print(f" Warnings: {result['validation_errors']}")Reuse a parse artifact for multiple extractions
If you need to extract different field sets from the same document, parse once and extract multiple times without re-uploading the file.
Python
import time
import httpx
# Example key value for this recipe - replace it with your own API key.
API_KEY = "pk_live_abc123"
BASE_URL = "https://api.expunct.ai"
# Header dict passed to every httpx call below; carries the key as X-API-Key.
headers = {"X-API-Key": API_KEY}
def poll_job(job_id: str, interval: float = 2.0, timeout: float | None = None) -> dict:
    """Poll a job until its status is ``"completed"`` or ``"failed"``.

    Args:
        job_id: ID of the job to poll.
        interval: Seconds to sleep between polls.
        timeout: Maximum seconds to keep polling; ``None`` (the default)
            polls indefinitely.

    Returns:
        The job payload as soon as its status is ``"completed"`` or
        ``"failed"``. Failed jobs are returned, not raised, so callers must
        check ``job["status"]`` themselves.

    Raises:
        httpx.HTTPStatusError: If a poll request returns a 4xx/5xx response.
        TimeoutError: If ``timeout`` is set and elapses before the job finishes.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
        # Surface HTTP errors directly instead of a KeyError on "status".
        r.raise_for_status()
        job = r.json()
        if job["status"] in ("completed", "failed"):
            return job
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"job {job_id} still {job['status']!r} after {timeout} seconds"
            )
        time.sleep(interval)
# Step 1: Parse the document once
with open("contract.pdf", "rb") as f:
    r = httpx.post(
        f"{BASE_URL}/api/v1/parse",
        headers=headers,
        files={"file": ("contract.pdf", f, "application/pdf")},
    )
# Fail fast on 4xx/5xx instead of hitting a KeyError on "id".
r.raise_for_status()
parse_job = poll_job(r.json()["id"])
# poll_job also returns failed jobs, so check before reading artifacts.
if parse_job["status"] != "completed":
    raise RuntimeError(f"parse failed: {parse_job.get('error_message')}")
canonical_id = next(
    a["id"] for a in parse_job["artifacts"]
    if a["artifact_kind"] == "canonical_document"
)

# Step 2a: Extract with the invoice template
r = httpx.post(
    f"{BASE_URL}/api/v1/extract",
    headers=headers,
    data={"parse_artifact_id": canonical_id, "template_id": "invoice"},
)
r.raise_for_status()
invoice_job = poll_job(r.json()["id"])

# Step 2b: Extract with a custom schema (no re-upload)
custom_schema = '{"type":"object","properties":{"governing_law":{"type":"string"},"termination_clause":{"type":"string"}}}'
r = httpx.post(
    f"{BASE_URL}/api/v1/extract",
    headers=headers,
    data={"parse_artifact_id": canonical_id, "extraction_schema": custom_schema},
)
r.raise_for_status()
clauses_job = poll_job(r.json()["id"])

# Each status printed here is either "completed" or "failed".
print("Invoice extraction:", invoice_job["status"])
print("Clause extraction:", clauses_job["status"])