Bài 8: Capstone Project - Vietnamese Legal Assistant¶

Tổng quan¶

Bài này tích hợp toàn bộ kiến thức Module I thành một hệ thống hoàn chỉnh: pipeline xử lý tài liệu pháp lý tiếng Việt → CRAG agent → guardrails → UI với Gradio → deploy lên HF Spaces.

Kiến trúc Hệ thống¶

graph TD
    subgraph Ingestion
        PDF[Vietnamese Legal PDFs] --> LOAD[Document Loader]
        LOAD --> CHUNK[Chunking<br/>512 chars, 64 overlap]
        CHUNK --> EMBED[multilingual-e5-large]
        EMBED --> VDB[(Qdrant / Chroma)]
    end

    subgraph CRAG Agent - LangGraph
        Q[User Query] --> GUARD_IN[Input Guardrails<br/>PII + Injection check]
        GUARD_IN --> RETRIEVE[Retrieve Top-10]
        VDB --> RETRIEVE
        RETRIEVE --> GRADE[Grade Chunks<br/>LLM-as-judge]
        GRADE --> DECIDE{Quality?}
        DECIDE -->|Good| GEN[Generate Answer]
        DECIDE -->|Poor| WEB[Tavily Web Search]
        WEB --> GEN
        GEN --> GUARD_OUT[Output Guardrails<br/>Faithfulness check]
        GUARD_OUT --> CITE[Add Citations]
    end

    subgraph Presentation
        CITE --> UI[Gradio UI]
        UI --> DEPLOY[HF Spaces]
    end

1. Ingestion Pipeline¶

import os
from pathlib import Path
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

def build_ingestion_pipeline(pdf_dir: str, persist_dir: str):
    """Load, chunk, embed and index Vietnamese legal PDFs into a Chroma store.

    Steps:
      1. Load every ``*.pdf`` found (recursively) under ``pdf_dir``.
      2. Split the loaded pages into chunks of at most 512 characters with
         64 characters of overlap.
      3. Tag each chunk with its length (``chunk_size``) and with the stem of
         its source file name (``law_name``).
      4. Embed the chunks with ``intfloat/multilingual-e5-large`` (normalised
         embeddings) and store them in the ``legal_vi`` Chroma collection.

    Args:
        pdf_dir: Directory containing the PDF files.
        persist_dir: Directory where the vector store is persisted.

    Returns:
        The populated Chroma vector store.

    Raises:
        ValueError: If no pages were loaded from ``pdf_dir`` or if the loaded
            pages produced no chunks (for example PDFs without a text layer).
    """
    print(f"📄 Loading PDFs from {pdf_dir}...")
    loader = PyPDFDirectoryLoader(pdf_dir, glob="**/*.pdf")
    docs = loader.load()

    # Fail before the embedding model is loaded: an empty corpus would
    # otherwise only surface as an opaque error from the vector store.
    if not docs:
        raise ValueError(f"No PDF pages could be loaded from {pdf_dir!r}")

    n_files = len({d.metadata["source"] for d in docs})
    print(f"✅ Loaded {len(docs)} pages from {n_files} files")

    # Chunking
    print("✂️ Chunking documents...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=64,
        separators=["\n\n", "\n", ".", " "],
    )
    chunks = splitter.split_documents(docs)
    if not chunks:
        raise ValueError(
            f"PDFs in {pdf_dir!r} produced no text chunks "
            "(are they scanned images without a text layer?)"
        )

    # Enrich metadata: the file stem doubles as the human-readable law name
    # that is later shown in citations.
    for chunk in chunks:
        chunk.metadata["chunk_size"] = len(chunk.page_content)
        chunk.metadata["law_name"] = Path(chunk.metadata["source"]).stem

    print(f"✅ Created {len(chunks)} chunks")

    # Embedding & indexing
    print("🔢 Embedding and indexing...")
    embedding = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large",
        encode_kwargs={"normalize_embeddings": True},
    )

    # NOTE(review): re-running against an existing persist_dir presumably adds
    # to the same "legal_vi" collection instead of replacing it — confirm, and
    # clear the directory first if duplicates are not wanted.
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        persist_directory=persist_dir,
        collection_name="legal_vi",
    )

    # Report the collection's own count rather than len(chunks) so the number
    # reflects what is actually stored.
    print(f"✅ Indexed {vectorstore._collection.count()} chunks → {persist_dir}")
    return vectorstore


# Run ingestion: index the PDFs under ./data/legal_pdfs/ and persist the
# vector store under ./data/vectorstore/. The returned store is bound to the
# module-level name `vectorstore`, which the retriever is built from.
vectorstore = build_ingestion_pipeline(
    pdf_dir="./data/legal_pdfs/",
    persist_dir="./data/vectorstore/",
)

2. CRAG Agent với LangGraph¶

from typing import TypedDict
from langgraph.graph import StateGraph, END
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.tools.tavily_search import TavilySearchResults
import json

# ====== STATE ======
class LegalAgentState(TypedDict, total=False):
    """State shared by the nodes of the legal CRAG graph.

    Declared with ``total=False`` because the state is filled in step by
    step: the graph is invoked with only ``question`` and every node returns
    a partial update, so no single dict ever has to carry all keys.
    """

    # The user's question; read by every node.
    question: str
    # Candidate documents: set by retrieval, filtered by grading and extended
    # by the web-search fallback.
    documents: list
    # Final answer text produced by the generation node.
    generation: str
    # True when grading kept too few chunks and the web fallback must run.
    web_search: bool
    # One {"source", "page"} dict per document used for the answer.
    citations: list
    # Flag set by the generation node.
    guardrail_passed: bool

# ====== LLM + TOOLS ======
# Chat model shared by the grading and the answer-generation nodes.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
# Web search tool used as the fallback source, configured with max_results=3.
web_search = TavilySearchResults(max_results=3)
# Retriever over the indexed legal chunks, configured with k=10 per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# ====== PROMPTS ======
# Relevance-grading prompt. Filled with str.format(question=..., document=...);
# the doubled braces are literal braces, so the model is shown the JSON shape
# {"relevant": ..., "score": ...} it is asked to return.
GRADE_PROMPT = """Đánh giá xem TÀI LIỆU có liên quan đến CÂU HỎI pháp lý không.

Câu hỏi: {question}
Tài liệu: {document}

Trả về JSON: {{"relevant": true/false, "score": 0.0-1.0}}"""

# Answer-generation prompt. Filled with str.format(context=..., question=...).
# It instructs the model to answer only from the supplied documents, to say so
# explicitly when they do not contain the answer, and to cite specific
# provisions where possible.
RAG_PROMPT = """Bạn là chuyên gia pháp lý Việt Nam. Trả lời câu hỏi DỰA TRÊN TÀI LIỆU.
Nếu không có thông tin, nói rõ "Không tìm thấy thông tin trong tài liệu hiện có".
Trích dẫn điều khoản cụ thể khi có thể.

Tài liệu:
{context}

Câu hỏi: {question}

Câu trả lời:"""

# ====== NODES ======
def retrieve(state: LegalAgentState):
    """Fetch candidate documents for the question and reset the web flag.

    Returns a partial state update: the retrieved ``documents`` and
    ``web_search`` set to ``False``.
    """
    query = state["question"]
    hits = retriever.invoke(query)
    return dict(documents=hits, web_search=False)

def _parse_grade(raw: str) -> dict:
    """Extract the JSON verdict object from the grader's reply.

    Chat models may wrap the object in a Markdown code fence or add prose
    around it, so only the span from the first ``{`` to the last ``}`` is
    decoded.

    Raises:
        ValueError: If the reply holds no JSON object or the decoded value is
            not a dict (``json.JSONDecodeError`` is a ``ValueError`` too).
    """
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"no JSON object in grader reply: {raw!r}")
    verdict = json.loads(raw[start:end + 1])
    if not isinstance(verdict, dict):
        raise ValueError(f"grader reply is not a JSON object: {raw!r}")
    return verdict


def grade_documents(state: LegalAgentState):
    """Keep only the retrieved chunks the LLM judges relevant to the question.

    Each document (first 500 characters) is graded with ``GRADE_PROMPT``. A
    chunk is kept when the verdict says ``relevant`` and its ``score`` is
    above 0.6. Grading is best-effort: if the call or the parsing fails for a
    chunk, that chunk is kept rather than dropped.

    Returns:
        A partial state update with the filtered ``documents`` and
        ``web_search`` set to ``True`` when fewer than two chunks survived.
    """
    question = state["question"]
    relevant_docs = []

    for doc in state["documents"]:
        try:
            result = llm.invoke(GRADE_PROMPT.format(
                question=question,
                document=doc.page_content[:500],
            ))
            grade = _parse_grade(result.content)
            # float() also accepts a score returned as a string ("0.8").
            is_relevant = (
                bool(grade.get("relevant"))
                and float(grade.get("score", 0)) > 0.6
            )
        except Exception as exc:
            # Deliberate best-effort: a grader failure must not lose evidence.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            print(f"⚠️ Grading failed, keeping chunk: {exc!r}")
            is_relevant = True

        if is_relevant:
            relevant_docs.append(doc)

    return {"documents": relevant_docs, "web_search": len(relevant_docs) < 2}

def web_search_node(state: LegalAgentState):
    """Add web search results to the documents as a fallback source.

    The question is searched with the suffix " luật Việt Nam" and every
    usable result becomes a ``Document`` tagged ``type="web"`` whose
    ``source`` is the result URL.

    Returns:
        A partial state update whose ``documents`` are the existing documents
        followed by the web documents. If the search yields nothing usable the
        existing documents are returned unchanged.
    """
    from langchain_core.documents import Document

    results = web_search.invoke(state["question"] + " luật Việt Nam")

    # The tool may hand back a non-list value (e.g. an error message string)
    # instead of raising; iterating that would index characters and crash.
    if not isinstance(results, list):
        print(f"⚠️ Web search returned no usable results: {results!r}")
        results = []

    web_docs = [
        Document(
            page_content=r["content"],
            metadata={"source": r.get("url", "web"), "type": "web"},
        )
        for r in results
        # Skip malformed or empty entries rather than failing the whole answer.
        if isinstance(r, dict) and r.get("content")
    ]
    return {"documents": state["documents"] + web_docs}

def generate(state: LegalAgentState):
    """Answer the question from the collected documents and list their sources.

    Every document is rendered into the prompt context with a
    ``[Nguồn: ..., Trang ...]`` header, then ``RAG_PROMPT`` is sent to the LLM.

    Returns:
        A partial state update with the answer text (``generation``), one
        ``{"source", "page"}`` dict per document (``citations``) and
        ``guardrail_passed`` set to ``True``.
    """
    docs = state["documents"]

    def source_label(doc):
        # One fallback chain for both the prompt context and the citations:
        # law name for indexed PDFs, otherwise the source (the URL for web
        # results), otherwise the literal "web". Previously citations skipped
        # the middle step, so web results were cited as just "web" although
        # the model had been shown their URL.
        meta = doc.metadata
        return meta.get("law_name", meta.get("source", "web"))

    # NOTE(review): the PDF loader's "page" value is presumably a 0-based
    # index — confirm whether it should be shown as page + 1.
    context = "\n\n---\n\n".join(
        f"[Nguồn: {source_label(d)}, "
        f"Trang {d.metadata.get('page', '?')}]\n{d.page_content}"
        for d in docs
    )

    answer = llm.invoke(RAG_PROMPT.format(
        context=context,
        question=state["question"],
    ))

    citations = [
        {"source": source_label(d), "page": d.metadata.get("page", "?")}
        for d in docs
    ]

    return {
        "generation": answer.content,
        "citations": citations,
        "guardrail_passed": True,
    }

# ====== ROUTING ======
def route_web_search(state: LegalAgentState) -> str:
    """Name the node to run after grading.

    Returns ``"web_search"`` when the state's ``web_search`` flag is set,
    otherwise ``"generate"``.
    """
    if state["web_search"]:
        return "web_search"
    return "generate"

# ====== BUILD GRAPH ======
# Graph over LegalAgentState; each node returns a partial state update.
workflow = StateGraph(LegalAgentState)
# Register the four processing steps under the names used by the edges below.
workflow.add_node("retrieve", retrieve)
workflow.add_node("grade_documents", grade_documents)
workflow.add_node("web_search", web_search_node)
workflow.add_node("generate", generate)

# Flow: retrieve -> grade_documents -> (web_search ->) generate -> END
workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "grade_documents")
# After grading, route_web_search returns "web_search" or "generate"; the
# mapping translates that return value into the node to run next.
workflow.add_conditional_edges(
    "grade_documents",
    route_web_search,
    {"web_search": "web_search", "generate": "generate"}
)
workflow.add_edge("web_search", "generate")
workflow.add_edge("generate", END)

# Compiled, invocable agent; called with {"question": ...}.
legal_agent = workflow.compile()

3. Guardrails Integration¶

import re

def input_guardrails(question: str) -> tuple[bool, str]:
    """Validate a user question before it is handed to the agent.

    Checks, in order:
      1. Length: at most 2000 characters, and at least 5 after stripping.
      2. Prompt-injection phrases (English and Vietnamese).

    The length checks run first so the regexes never scan unbounded input.
    As a consequence an over-long question is reported as too long even if it
    also contains an injection phrase; it is rejected either way.

    Args:
        question: Raw text typed by the user.

    Returns:
        ``(passed, reason)`` — ``(True, "OK")`` when the question is accepted,
        otherwise ``False`` with a user-facing explanation.
    """
    import unicodedata

    # Length check
    if len(question) > 2000:
        return False, "Câu hỏi quá dài (tối đa 2000 ký tự)"

    if len(question.strip()) < 5:
        return False, "Câu hỏi quá ngắn"

    # Injection check
    injection_patterns = [
        r"ignore.*(instruction|system|prompt)",
        r"bỏ qua.*hướng dẫn",
        # Word boundaries keep ordinary legal wording such as
        # "contract as signed" from matching "act as".
        r"\byou are now\b",
        r"\bact as\b",
    ]

    # Vietnamese text can arrive decomposed (NFD); compose both the input and
    # the patterns to NFC so accented phrases compare equal.
    normalized = unicodedata.normalize("NFC", question).lower()
    for pattern in injection_patterns:
        composed = unicodedata.normalize("NFC", pattern)
        # DOTALL: a line break between the words must not hide the phrase.
        if re.search(composed, normalized, flags=re.DOTALL):
            return False, "Phát hiện prompt injection"

    return True, "OK"

def output_guardrails(answer: str) -> tuple[bool, str]:
    """Scrub leaked system markers from an answer.

    Removes every case-insensitive occurrence of ``system prompt:`` and then
    of ``<system>``, and trims surrounding whitespace.

    Returns:
        ``(passed, cleaned_answer)``; ``passed`` is always ``True``.
    """
    cleaned = answer
    # Applied one after the other, in this order.
    for marker in (r"system prompt:", r"<system>"):
        cleaned = re.compile(marker, re.IGNORECASE).sub("", cleaned)
    return True, cleaned.strip()

def run_agent_with_guardrails(question: str) -> dict:
    """Run the CRAG agent between the input and output guardrails.

    Returns:
        A dict with ``answer`` (cleaned answer, or the rejection reason
        prefixed with a warning sign), ``citations`` and ``web_used``.
    """
    # Input check: a rejected question never reaches the agent.
    is_allowed, rejection = input_guardrails(question)
    if not is_allowed:
        return dict(answer=f"⚠️ {rejection}", citations=[], web_used=False)

    # Run CRAG agent
    agent_state = legal_agent.invoke({"question": question})

    # Output check: only the cleaned text is used, the flag is ignored.
    cleaned = output_guardrails(agent_state["generation"])[1]

    response = {"answer": cleaned}
    response["citations"] = agent_state.get("citations", [])
    response["web_used"] = agent_state.get("web_search", False)
    return response

4. Gradio UI¶

import gradio as gr

def chat_interface(question: str, history: list) -> tuple[str, list]:
    """Gradio callback: answer one question and append it to the chat history.

    Args:
        question: Text from the input box.
        history: Chat history as a list of ``(question, answer)`` tuples;
            ``None`` is treated as an empty history.

    Returns:
        ``("", history)`` — the empty string clears the input box and the
        history now ends with the new exchange.
    """
    result = run_agent_with_guardrails(question)

    answer = result["answer"]

    # Append citation info
    if result["citations"]:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # three sources shown are the first three cited and are the same on
        # every run (a set would pick an arbitrary three).
        sources = list(dict.fromkeys(
            f"{c['source']} tr.{c['page']}"
            for c in result["citations"]
        ))
        answer += f"\n\n📚 *Nguồn: {', '.join(sources[:3])}*"

    if result["web_used"]:
        answer += "\n\n🌐 *Một phần thông tin từ tìm kiếm web*"

    if history is None:
        history = []
    history.append((question, answer))
    return "", history

# Chat UI. Components are laid out in the order they are created below:
# header, chat window, input row, example questions.
with gr.Blocks(title="Trợ lý Pháp lý Việt Nam", theme=gr.themes.Soft()) as demo:
    # Header with a short description and a "reference only" disclaimer.
    gr.Markdown("""
    # ⚖️ Trợ lý Pháp lý Việt Nam
    Đặt câu hỏi về luật doanh nghiệp, luật lao động và các văn bản pháp luật Việt Nam.

    > ⚠️ Đây là công cụ tham khảo, không thay thế tư vấn pháp lý chính thức.
    """)

    # Conversation display; also carries the history passed to chat_interface.
    chatbot = gr.Chatbot(height=500, show_label=False)

    # Input row: wide text box next to a narrow send button (9:1).
    with gr.Row():
        msg = gr.Textbox(
            placeholder="Nhập câu hỏi pháp lý của bạn...",
            show_label=False,
            scale=9,
        )
        submit = gr.Button("Gửi", scale=1, variant="primary")

    # Sample questions wired to the text box via inputs=msg.
    examples = gr.Examples(
        examples=[
            "Điều kiện thành lập công ty TNHH 2 thành viên?",
            "Người lao động có quyền nghỉ phép năm bao nhiêu ngày?",
            "Thủ tục đăng ký kinh doanh hộ cá thể?",
        ],
        inputs=msg,
    )

    # Both the button and the text box's submit event run the same handler:
    # inputs are (text, history), outputs are (cleared text, updated history).
    submit.click(chat_interface, [msg, chatbot], [msg, chatbot])
    msg.submit(chat_interface, [msg, chatbot], [msg, chatbot])

# NOTE(review): share=True presumably requests a public share link, which is
# useful from a notebook but likely unnecessary on HF Spaces — confirm.
demo.launch(share=True)

5. Deploy lên HF Spaces¶

# Cấu trúc project
vietnamese-legal-assistant/
├── app.py              # Gradio app (entry point)
├── requirements.txt    # Dependencies
├── README.md           # HF Spaces config (yaml header)
└── data/
    └── vectorstore/    # Pre-built vector store (upload lên HF)

# README.md header cho HF Spaces
---
title: Vietnamese Legal Assistant
emoji: ⚖️
colorFrom: blue
colorTo: indigo
sdk: gradio
sdk_version: 4.44.1
app_file: app.py
pinned: false
---

# requirements.txt
gradio>=4.0
langchain>=0.3
langchain-openai
langchain-community
langchain-chroma
sentence-transformers
ragas
langgraph
tavily-python

# Deploy
pip install huggingface_hub
huggingface-cli login

# Push toàn bộ repo
git init
git add .
git commit -m "Initial commit"
git remote add origin https://huggingface.co/spaces/username/vietnamese-legal-assistant
git branch -M main
git push origin main

6. RAGAS Evaluation - Take-home¶

from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision
from datasets import Dataset

# Evaluation set: 30 question / reference-answer pairs written from the legal
# documents. The two lists are index-aligned.
test_questions = [
    "Công ty TNHH có thể có tối đa bao nhiêu thành viên?",
    # ... 29 more questions
]

# Reference answers. context_recall and context_precision compare the
# retrieved contexts against these, so every question needs one.
test_ground_truths = [
    "Công ty TNHH hai thành viên trở lên có từ 02 đến 50 thành viên.",
    # ... 29 more reference answers
]

# zip() would silently drop unpaired items, so check the alignment up front.
if len(test_questions) != len(test_ground_truths):
    raise ValueError(
        f"{len(test_questions)} questions but "
        f"{len(test_ground_truths)} reference answers"
    )

# Run the agent on every question and collect what RAGAS needs per sample:
# the question, the generated answer, the contexts it was generated from and
# the reference answer.
results = []
for q, reference in zip(test_questions, test_ground_truths):
    agent_result = legal_agent.invoke({"question": q})
    results.append({
        "question": q,
        "answer": agent_result["generation"],
        "contexts": [d.page_content for d in agent_result["documents"]],
        "ground_truth": reference,
    })

eval_dataset = Dataset.from_list(results)
scores = evaluate(
    dataset=eval_dataset,
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
)

print("\n📊 RAGAS Evaluation Results:")
print(f"Faithfulness:        {scores['faithfulness']:.3f}")
print(f"Answer Relevancy:    {scores['answer_relevancy']:.3f}")
print(f"Context Recall:      {scores['context_recall']:.3f}")
print(f"Context Precision:   {scores['context_precision']:.3f}")

Checklist Hoàn thiện Dự án¶

Ingestion pipeline xử lý được ≥ 5 PDFs pháp lý tiếng Việt
CRAG agent chạy được với LangGraph
Web search fallback hoạt động
Input/output guardrails active
Gradio UI với chat history và citations
Deploy thành công lên HF Spaces
RAGAS evaluation với 30 Q&A pairs, tất cả 4 metrics
Faithfulness > 0.80 trên test set