Skip to Content

Workflows

Recipes for the three most common Document Intelligence use cases.

RAG ingestion pipeline

Parse a document, embed the sanitized chunks, and store them in a vector database. Use safe-parse so no PII lands in your vector index.

# RAG ingestion recipe: run safe-parse on a PDF, wait for the job, then embed
# and store the sanitized chunks it produced.
import time

import httpx

API_KEY = "pk_live_abc123"  # placeholder -- substitute your own API key
BASE_URL = "https://api.expunct.ai"
POLL_INTERVAL_S = 2    # seconds between job status checks
POLL_TIMEOUT_S = 600   # stop waiting after this long instead of polling forever

headers = {"X-API-Key": API_KEY}

# 1. Submit safe-parse job
with open("legal_brief.pdf", "rb") as f:
    r = httpx.post(
        f"{BASE_URL}/api/v1/workflows/safe-parse",
        headers=headers,
        files={"file": ("legal_brief.pdf", f, "application/pdf")},
        data={"language": "en"},
    )
# Fail here on a 4xx/5xx instead of with a KeyError on the missing "id" below.
r.raise_for_status()
job_id = r.json()["id"]

# 2. Poll until done
deadline = time.monotonic() + POLL_TIMEOUT_S
while True:
    r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
    r.raise_for_status()
    job = r.json()
    if job["status"] == "completed":
        break
    if job["status"] == "failed":
        raise RuntimeError(job["error_message"])
    if time.monotonic() >= deadline:
        raise TimeoutError(f"job {job_id} did not finish within {POLL_TIMEOUT_S}s")
    time.sleep(POLL_INTERVAL_S)

# 3. Find the sanitized chunks artifact
chunks_artifact = next(
    (a for a in job["artifacts"] if a["artifact_kind"] == "sanitized_chunks_v1"),
    None,
)
if chunks_artifact is None:
    # A bare next() would raise an opaque StopIteration here.
    raise RuntimeError(f"job {job_id} produced no sanitized_chunks_v1 artifact")

# 4. Fetch the chunk content
r = httpx.get(
    f"{BASE_URL}/api/v1/documents/{chunks_artifact['id']}/content",
    headers=headers,
)
r.raise_for_status()
chunk_set = r.json()

# 5. Embed and store
# `embed` and `vector_db` are not defined in this recipe: supply your own
# embedding function and vector-store client.
for chunk in chunk_set["chunks"]:
    embedding = embed(chunk["text"])  # your embedding function
    vector_db.upsert(
        chunk["chunk_id"],
        embedding,
        metadata={
            "text": chunk["text"],
            "page": chunk["page_number"],
            "source": "legal_brief.pdf",
        },
    )

Compliance pipeline

Run safe-parse on incoming documents to produce an audit-ready sanitized copy, then verify redaction coverage before archiving.

# Compliance recipe: safe-parse a document, then report how many blocks of the
# sanitized copy carry redaction labels.
import time
from pathlib import Path

import httpx

API_KEY = "pk_live_abc123"  # placeholder -- substitute your own API key
BASE_URL = "https://api.expunct.ai"
POLL_INTERVAL_S = 2    # seconds between job status checks
POLL_TIMEOUT_S = 600   # stop waiting after this long instead of polling forever

headers = {"X-API-Key": API_KEY}


def _wait_for_job(job_id) -> dict:
    """Poll the job endpoint until the job completes; raise if it fails or stalls."""
    deadline = time.monotonic() + POLL_TIMEOUT_S
    while True:
        r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
        r.raise_for_status()
        job = r.json()
        if job["status"] == "completed":
            return job
        if job["status"] == "failed":
            raise RuntimeError(f"safe-parse failed: {job['error_message']}")
        if time.monotonic() >= deadline:
            raise TimeoutError(f"job {job_id} did not finish within {POLL_TIMEOUT_S}s")
        time.sleep(POLL_INTERVAL_S)


def safe_parse_and_verify(file_path: str, policy_id: str | None = None) -> dict:
    """Run safe-parse on a document and print a redaction-coverage summary.

    Args:
        file_path: Path of the PDF to upload.
        policy_id: Optional policy identifier; sent as ``policy_id`` only
            when provided.

    Returns:
        A mapping of every artifact kind produced by the job to its
        artifact ID.

    Raises:
        httpx.HTTPStatusError: If any request returns a 4xx/5xx status.
        RuntimeError: If the job reports status ``failed``.
        TimeoutError: If the job does not finish within ``POLL_TIMEOUT_S``.
    """
    data = {"language": "en"}
    if policy_id:
        data["policy_id"] = policy_id

    with open(file_path, "rb") as f:
        r = httpx.post(
            f"{BASE_URL}/api/v1/workflows/safe-parse",
            headers=headers,
            # Send only the base name so local directory names are not
            # included in the upload's filename.
            files={"file": (Path(file_path).name, f, "application/pdf")},
            data=data,
        )
    r.raise_for_status()
    job = _wait_for_job(r.json()["id"])

    artifacts = {a["artifact_kind"]: a["id"] for a in job["artifacts"]}

    # Fetch the sanitized canonical doc and count blocks with redaction labels
    r = httpx.get(
        f"{BASE_URL}/api/v1/documents/{artifacts['sanitized_canonical_document']}/content",
        headers=headers,
    )
    r.raise_for_status()
    doc = r.json()

    total_blocks = sum(len(page["blocks"]) for page in doc["pages"])
    # Heuristic: a block counts as redacted when its text contains both "["
    # and "]". Assumes redaction labels are bracketed -- confirm against the
    # label format your policy emits.
    redacted_blocks = sum(
        1
        for page in doc["pages"]
        for block in page["blocks"]
        if "[" in block["text"] and "]" in block["text"]
    )
    print(f"Redaction coverage: {redacted_blocks}/{total_blocks} blocks contain redaction labels")
    return artifacts


# Example
artifacts = safe_parse_and_verify("patient_intake.pdf", policy_id="pol_hipaa_strict")
print(f"Sanitized doc: {artifacts['sanitized_canonical_document']}")
print(f"Sanitized markdown: {artifacts['sanitized_markdown_render']}")

Invoice extraction

Extract structured fields from a batch of invoices using the built-in invoice template. Submit each file to the extract endpoint, wait for the job to complete, then post-process the results.

# Invoice extraction recipe: submit each invoice to the extract endpoint with
# the built-in "invoice" template, then keep only high-confidence fields.
import time
from pathlib import Path

import httpx

API_KEY = "pk_live_abc123"  # placeholder -- substitute your own API key
BASE_URL = "https://api.expunct.ai"
POLL_INTERVAL_S = 2    # seconds between job status checks
POLL_TIMEOUT_S = 600   # stop waiting after this long instead of polling forever
MIN_CONFIDENCE = 0.7   # fields scored below this are discarded

headers = {"X-API-Key": API_KEY}


def _wait_for_job(job_id) -> dict:
    """Poll the job endpoint until the job completes; raise if it fails or stalls."""
    deadline = time.monotonic() + POLL_TIMEOUT_S
    while True:
        r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
        r.raise_for_status()
        job = r.json()
        if job["status"] == "completed":
            return job
        if job["status"] == "failed":
            raise RuntimeError(job["error_message"])
        if time.monotonic() >= deadline:
            raise TimeoutError(f"job {job_id} did not finish within {POLL_TIMEOUT_S}s")
        time.sleep(POLL_INTERVAL_S)


def extract_invoice(file_path: str) -> dict:
    """Extract invoice fields from a PDF and return the ``extract_result`` content.

    Args:
        file_path: Path of the invoice PDF to upload.

    Returns:
        The decoded JSON content of the job's ``extract_result`` artifact.

    Raises:
        httpx.HTTPStatusError: If any request returns a 4xx/5xx status.
        RuntimeError: If the job fails or yields no ``extract_result`` artifact.
        TimeoutError: If the job does not finish within ``POLL_TIMEOUT_S``.
    """
    with open(file_path, "rb") as f:
        r = httpx.post(
            f"{BASE_URL}/api/v1/extract",
            headers=headers,
            # Send only the base name so local directory names are not
            # included in the upload's filename.
            files={"file": (Path(file_path).name, f, "application/pdf")},
            data={"template_id": "invoice"},
        )
    r.raise_for_status()
    job_id = r.json()["id"]
    job = _wait_for_job(job_id)

    result_artifact = next(
        (a for a in job["artifacts"] if a["artifact_kind"] == "extract_result"),
        None,
    )
    if result_artifact is None:
        # A bare next() would raise an opaque StopIteration here.
        raise RuntimeError(f"job {job_id} produced no extract_result artifact")

    r = httpx.get(
        f"{BASE_URL}/api/v1/documents/{result_artifact['id']}/content",
        headers=headers,
    )
    r.raise_for_status()
    return r.json()


# Process a batch
invoices = ["inv_001.pdf", "inv_002.pdf", "inv_003.pdf"]

for invoice_file in invoices:
    result = extract_invoice(invoice_file)

    # Build a dict of high-confidence field values
    fields = {
        field["field_name"]: field["value"]
        for field in result["fields"]
        if field["confidence"] >= MIN_CONFIDENCE  # discard low-confidence extractions
    }

    print(f"{invoice_file}:")
    print(f"  Invoice #: {fields.get('invoice_number', 'N/A')}")
    print(f"  Total: {fields.get('total_amount', 'N/A')}")
    print(f"  Vendor: {fields.get('vendor_name', 'N/A')}")

    # .get() treats an absent key the same as an empty error list.
    if result.get("validation_errors"):
        print(f"  Warnings: {result['validation_errors']}")

Reuse a parse artifact for multiple extractions

If you need to extract different field sets from the same document, parse once and extract multiple times without re-uploading the file.

# Artifact-reuse recipe: parse a document once, then run several extractions
# against the resulting canonical document without re-uploading the file.
import json
import time

import httpx

API_KEY = "pk_live_abc123"  # placeholder -- substitute your own API key
BASE_URL = "https://api.expunct.ai"

headers = {"X-API-Key": API_KEY}


def poll_job(job_id: str, timeout_s: float = 600.0) -> dict:
    """Poll a job until it reaches a terminal status and return the job record.

    The job is returned for both ``completed`` and ``failed`` statuses, so
    callers must inspect ``job["status"]`` themselves.

    Args:
        job_id: Identifier of the job to poll.
        timeout_s: Maximum number of seconds to wait before giving up.

    Returns:
        The decoded job record once its status is ``completed`` or ``failed``.

    Raises:
        httpx.HTTPStatusError: If a poll request returns a 4xx/5xx status.
        TimeoutError: If the job is still not terminal after ``timeout_s``.
    """
    deadline = time.monotonic() + timeout_s
    while True:
        r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
        r.raise_for_status()
        job = r.json()
        if job["status"] in ("completed", "failed"):
            return job
        if time.monotonic() >= deadline:
            raise TimeoutError(f"job {job_id} did not finish within {timeout_s}s")
        time.sleep(2)


# Step 1: Parse the document once
with open("contract.pdf", "rb") as f:
    r = httpx.post(
        f"{BASE_URL}/api/v1/parse",
        headers=headers,
        files={"file": ("contract.pdf", f, "application/pdf")},
    )
r.raise_for_status()
parse_job = poll_job(r.json()["id"])

# poll_job also returns failed jobs; stop here rather than searching the
# artifacts of a parse that did not complete.
if parse_job["status"] != "completed":
    raise RuntimeError(f"parse failed: {parse_job.get('error_message')}")

canonical_id = next(
    (
        a["id"]
        for a in parse_job["artifacts"]
        if a["artifact_kind"] == "canonical_document"
    ),
    None,
)
if canonical_id is None:
    # A bare next() would raise an opaque StopIteration here.
    raise RuntimeError("parse job produced no canonical_document artifact")

# Step 2a: Extract with the invoice template
r = httpx.post(
    f"{BASE_URL}/api/v1/extract",
    headers=headers,
    data={"parse_artifact_id": canonical_id, "template_id": "invoice"},
)
r.raise_for_status()
invoice_job = poll_job(r.json()["id"])

# Step 2b: Extract with a custom schema (no re-upload)
# The schema is sent as a JSON string; building it from a dict keeps it valid
# and easy to edit. Compact separators reproduce the original literal exactly.
custom_schema = json.dumps(
    {
        "type": "object",
        "properties": {
            "governing_law": {"type": "string"},
            "termination_clause": {"type": "string"},
        },
    },
    separators=(",", ":"),
)
r = httpx.post(
    f"{BASE_URL}/api/v1/extract",
    headers=headers,
    data={"parse_artifact_id": canonical_id, "extraction_schema": custom_schema},
)
r.raise_for_status()
clauses_job = poll_job(r.json()["id"])

print("Invoice extraction:", invoice_job["status"])
print("Clause extraction:", clauses_job["status"])