Workflows

Recipes for the three most common Document Intelligence use cases, plus a pattern for reusing one parsed document across multiple extractions.

RAG ingestion pipeline

Parse a document, embed the sanitized chunks, and store them in a vector database. Use safe-parse so no PII lands in your vector index.

Python


import time
import httpx
 
API_KEY = "pk_live_abc123"
BASE_URL = "https://api.expunct.ai"
headers = {"X-API-Key": API_KEY}
 
# 1. Submit safe-parse job
with open("legal_brief.pdf", "rb") as f:
    r = httpx.post(
        f"{BASE_URL}/api/v1/workflows/safe-parse",
        headers=headers,
        files={"file": ("legal_brief.pdf", f, "application/pdf")},
        data={"language": "en"},
    )
# Fail on a 4xx/5xx response here rather than with a KeyError on "id" below
r.raise_for_status()
job_id = r.json()["id"]

# 2. Poll every 2 seconds until the job completes or fails
while True:
    r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
    r.raise_for_status()
    job = r.json()
    if job["status"] == "completed":
        break
    if job["status"] == "failed":
        raise RuntimeError(job["error_message"])
    time.sleep(2)

# 3. Find the sanitized chunks artifact
chunks_artifact = next(
    (a for a in job["artifacts"] if a["artifact_kind"] == "sanitized_chunks_v1"),
    None,
)
if chunks_artifact is None:
    # A bare next() would raise an uninformative StopIteration here
    raise RuntimeError("job completed without a sanitized_chunks_v1 artifact")

# 4. Fetch the chunk content
r = httpx.get(
    f"{BASE_URL}/api/v1/documents/{chunks_artifact['id']}/content",
    headers=headers,
)
r.raise_for_status()
chunk_set = r.json()

# 5. Embed and store
for chunk in chunk_set["chunks"]:
    embedding = embed(chunk["text"])          # your embedding function
    vector_db.upsert(chunk["chunk_id"], embedding, metadata={
        "text": chunk["text"],
        "page": chunk["page_number"],
        "source": "legal_brief.pdf",
    })

Compliance pipeline

Run safe-parse on incoming documents to produce an audit-ready sanitized copy, then verify redaction coverage before archiving.

Python


import time
import httpx
 
API_KEY = "pk_live_abc123"
BASE_URL = "https://api.expunct.ai"
headers = {"X-API-Key": API_KEY}
 
def safe_parse_and_verify(file_path: str, policy_id: str | None = None) -> dict:
    """Run safe-parse on a document and report how many blocks carry redaction labels.

    Args:
        file_path: Path to the PDF to upload.
        policy_id: Optional policy ID; sent as the ``policy_id`` form field
            when provided.

    Returns:
        A dict mapping each artifact kind on the completed job to its
        artifact ID (all artifacts, not only the sanitized ones).

    Raises:
        httpx.HTTPStatusError: If any API call returns a 4xx/5xx response.
        RuntimeError: If the job reports status ``failed``.
        KeyError: If the completed job has no
            ``sanitized_canonical_document`` artifact.
    """
    import os  # local import: only used to strip directories from the upload name

    data = {"language": "en"}
    if policy_id:
        data["policy_id"] = policy_id

    with open(file_path, "rb") as f:
        r = httpx.post(
            f"{BASE_URL}/api/v1/workflows/safe-parse",
            headers=headers,
            # Send only the file name, not the caller's local directory layout
            files={"file": (os.path.basename(file_path), f, "application/pdf")},
            data=data,
        )
    r.raise_for_status()
    job_id = r.json()["id"]

    # Poll every 2 seconds until the job completes or fails
    while True:
        r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
        r.raise_for_status()
        job = r.json()
        if job["status"] == "completed":
            break
        if job["status"] == "failed":
            raise RuntimeError(f"safe-parse failed: {job['error_message']}")
        time.sleep(2)

    artifacts = {a["artifact_kind"]: a["id"] for a in job["artifacts"]}

    # Fetch the sanitized canonical doc to measure redaction coverage
    r = httpx.get(
        f"{BASE_URL}/api/v1/documents/{artifacts['sanitized_canonical_document']}/content",
        headers=headers,
    )
    r.raise_for_status()
    doc = r.json()

    total_blocks = sum(len(page["blocks"]) for page in doc["pages"])
    # Heuristic: a block counts as redacted when its text contains both "["
    # and "]" (presumably bracketed redaction labels -- confirm the label
    # format). Ordinary bracketed text is counted too, and this does not
    # detect PII that was left unredacted.
    redacted_blocks = sum(
        1
        for page in doc["pages"]
        for block in page["blocks"]
        if "[" in block["text"] and "]" in block["text"]
    )
    print(f"Redaction coverage: {redacted_blocks}/{total_blocks} blocks contain redaction labels")

    return artifacts
 
# Example: sanitize one document under a named policy and print the resulting IDs
artifacts = safe_parse_and_verify(
    file_path="patient_intake.pdf",
    policy_id="pol_hipaa_strict",
)
print(f"Sanitized doc: {artifacts['sanitized_canonical_document']}")
print(f"Sanitized markdown: {artifacts['sanitized_markdown_render']}")

Invoice extraction

Extract structured fields from a batch of invoices using the built-in invoice template. Each invoice is uploaded to the extract endpoint in a single call; the results are then filtered by confidence and post-processed.

Python


import time
import httpx
 
API_KEY = "pk_live_abc123"
BASE_URL = "https://api.expunct.ai"
headers = {"X-API-Key": API_KEY}
 
def extract_invoice(file_path: str) -> dict:
    """Extract invoice fields from a PDF using the ``invoice`` template.

    Args:
        file_path: Path to the invoice PDF to upload.

    Returns:
        The JSON content of the job's ``extract_result`` artifact.

    Raises:
        httpx.HTTPStatusError: If any API call returns a 4xx/5xx response.
        RuntimeError: If the job reports status ``failed`` or completes
            without an ``extract_result`` artifact.
    """
    import os  # local import: only used to strip directories from the upload name

    with open(file_path, "rb") as f:
        r = httpx.post(
            f"{BASE_URL}/api/v1/extract",
            headers=headers,
            # Send only the file name, not the caller's local directory layout
            files={"file": (os.path.basename(file_path), f, "application/pdf")},
            data={"template_id": "invoice"},
        )
    r.raise_for_status()
    job_id = r.json()["id"]

    # Poll every 2 seconds until the job completes or fails
    while True:
        r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
        r.raise_for_status()
        job = r.json()
        if job["status"] == "completed":
            break
        if job["status"] == "failed":
            raise RuntimeError(job["error_message"])
        time.sleep(2)

    result_artifact = next(
        (a for a in job["artifacts"] if a["artifact_kind"] == "extract_result"),
        None,
    )
    if result_artifact is None:
        # A bare next() would leak an uninformative StopIteration to the caller
        raise RuntimeError("job completed without an extract_result artifact")

    r = httpx.get(
        f"{BASE_URL}/api/v1/documents/{result_artifact['id']}/content",
        headers=headers,
    )
    r.raise_for_status()
    return r.json()
 
# Process a batch
invoices = ["inv_001.pdf", "inv_002.pdf", "inv_003.pdf"]

for invoice_file in invoices:
    result = extract_invoice(invoice_file)

    # Keep only fields extracted with a confidence of at least 0.7
    fields = {}
    for entry in result["fields"]:
        if entry["confidence"] >= 0.7:
            fields[entry["field_name"]] = entry["value"]

    print(f"{invoice_file}:")
    print(f"  Invoice #: {fields.get('invoice_number', 'N/A')}")
    print(f"  Total: {fields.get('total_amount', 'N/A')}")
    print(f"  Vendor: {fields.get('vendor_name', 'N/A')}")

    # Only mention validation errors when the result reports some
    if result["validation_errors"]:
        print(f"  Warnings: {result['validation_errors']}")

Reuse a parse artifact for multiple extractions

If you need to extract different field sets from the same document, parse once and extract multiple times without re-uploading the file.

Python


import time
import httpx
 
API_KEY = "pk_live_abc123"
BASE_URL = "https://api.expunct.ai"
headers = {"X-API-Key": API_KEY}
 
def poll_job(job_id: str) -> dict:
    """Poll a job every two seconds until it reaches a terminal status.

    Args:
        job_id: ID of the job to poll.

    Returns:
        The job payload once its status is ``completed`` or ``failed``.
        A failed job is returned, not raised, so callers must check
        ``job["status"]`` themselves.

    Raises:
        httpx.HTTPStatusError: If the status request returns a 4xx/5xx
            response.
    """
    while True:
        r = httpx.get(f"{BASE_URL}/api/v1/documents/jobs/{job_id}", headers=headers)
        # Without this, an error response surfaces as a KeyError on "status"
        r.raise_for_status()
        job = r.json()
        if job["status"] in ("completed", "failed"):
            return job
        time.sleep(2)
 
# Step 1: Parse the document once
with open("contract.pdf", "rb") as f:
    r = httpx.post(
        f"{BASE_URL}/api/v1/parse",
        headers=headers,
        files={"file": ("contract.pdf", f, "application/pdf")},
    )
r.raise_for_status()
parse_job = poll_job(r.json()["id"])

# poll_job returns failed jobs too; stop here instead of searching their artifacts
if parse_job["status"] == "failed":
    raise RuntimeError(f"parse failed: {parse_job.get('error_message')}")

canonical_id = next(
    (
        a["id"] for a in parse_job["artifacts"]
        if a["artifact_kind"] == "canonical_document"
    ),
    None,
)
if canonical_id is None:
    raise RuntimeError("parse job completed without a canonical_document artifact")

# Step 2a: Extract with the invoice template
r = httpx.post(
    f"{BASE_URL}/api/v1/extract",
    headers=headers,
    data={"parse_artifact_id": canonical_id, "template_id": "invoice"},
)
r.raise_for_status()
invoice_job = poll_job(r.json()["id"])

# Step 2b: Extract with a custom schema (no re-upload)
custom_schema = '{"type":"object","properties":{"governing_law":{"type":"string"},"termination_clause":{"type":"string"}}}'
r = httpx.post(
    f"{BASE_URL}/api/v1/extract",
    headers=headers,
    data={"parse_artifact_id": canonical_id, "extraction_schema": custom_schema},
)
r.raise_for_status()
clauses_job = poll_job(r.json()["id"])

# Either extraction may have ended as "failed"; the status is printed, not raised
print("Invoice extraction:", invoice_job["status"])
print("Clause extraction:", clauses_job["status"])

Node.js


const API_KEY = 'pk_live_abc123';
const BASE_URL = 'https://api.expunct.ai';
const headers = { 'X-API-Key': API_KEY };
 
/**
 * Polls a job every two seconds until its status is `completed` or `failed`.
 *
 * A failed job is returned, not thrown, so callers must check `job.status`.
 *
 * @param jobId ID of the job to poll.
 * @returns The job payload in its terminal state.
 * @throws Error if the status request returns a non-2xx response.
 */
async function pollJob(jobId: string) {
  while (true) {
    const res = await fetch(`${BASE_URL}/api/v1/documents/jobs/${jobId}`, { headers });
    // Without this check an error response would loop forever: its body has no terminal status
    if (!res.ok) {
      throw new Error(`Job status request failed: HTTP ${res.status}`);
    }
    const job = await res.json();
    if (job.status === 'completed' || job.status === 'failed') return job;
    await new Promise((resolve) => setTimeout(resolve, 2000));
  }
}
 
// Uses only Node 18+ built-ins (global fetch, FormData, Blob).
// Run as an ES module: the script relies on top-level await.
const { readFile } = await import('node:fs/promises');

// Step 1: Parse the document once
const parseForm = new FormData();
parseForm.append(
  'file',
  new Blob([await readFile('contract.pdf')], { type: 'application/pdf' }),
  'contract.pdf',
);

// fetch sets the multipart Content-Type (including the boundary) for a FormData body,
// so only the API key header is passed explicitly.
const parseRes = await fetch(`${BASE_URL}/api/v1/parse`, {
  method: 'POST',
  headers,
  body: parseForm,
});
if (!parseRes.ok) throw new Error(`Parse request failed: HTTP ${parseRes.status}`);
const parseJob = await pollJob((await parseRes.json()).id);
const canonicalId = parseJob.artifacts?.find(
  (a: any) => a.artifact_kind === 'canonical_document',
)?.id;
// Stop here rather than sending the string "undefined" as parse_artifact_id
if (!canonicalId) {
  throw new Error(`Parse job ended as "${parseJob.status}" without a canonical_document artifact`);
}

// Step 2a: Invoice template extraction
const invoiceForm = new FormData();
invoiceForm.append('parse_artifact_id', canonicalId);
invoiceForm.append('template_id', 'invoice');

const invoiceRes = await fetch(`${BASE_URL}/api/v1/extract`, {
  method: 'POST',
  headers,
  body: invoiceForm,
});
if (!invoiceRes.ok) throw new Error(`Extract request failed: HTTP ${invoiceRes.status}`);
const invoiceJob = await pollJob((await invoiceRes.json()).id);

// Step 2b: Custom schema extraction (no re-upload)
const customSchema = JSON.stringify({
  type: 'object',
  properties: {
    governing_law: { type: 'string' },
    termination_clause: { type: 'string' },
  },
});
const clauseForm = new FormData();
clauseForm.append('parse_artifact_id', canonicalId);
clauseForm.append('extraction_schema', customSchema);

const clauseRes = await fetch(`${BASE_URL}/api/v1/extract`, {
  method: 'POST',
  headers,
  body: clauseForm,
});
if (!clauseRes.ok) throw new Error(`Extract request failed: HTTP ${clauseRes.status}`);
const clausesJob = await pollJob((await clauseRes.json()).id);

// Either extraction may have ended as "failed"; the status is logged, not thrown
console.log('Invoice extraction:', invoiceJob.status);
console.log('Clause extraction:', clausesJob.status);