# Stakeholder Identification - 02

In [1]:
# === 02) Stakeholder Identification — LLM JSONL → Editable Table (locked Subject/State/Issue) ===
# Requires: pip install gradio openai pandas pyarrow

import os, json, re
from pathlib import Path
from datetime import datetime
import pandas as pd
import gradio as gr

# ----------------------------
# Project ROOT (same logic as 00)
# ----------------------------
def find_project_root(max_up=8):
    """Locate the project root by walking upward from the working directory.

    A directory qualifies when it holds a ``configs`` entry together with
    either ``src`` or ``notebooks``. At most ``max_up`` directories are
    inspected, starting with the current one. When none qualifies, the
    current working directory is returned.
    """
    def _looks_like_root(folder):
        # "configs" is mandatory; one of the two code folders must accompany it.
        if not (folder / "configs").exists():
            return False
        return (folder / "src").exists() or (folder / "notebooks").exists()

    candidate = Path.cwd()
    for _attempt in range(max_up):
        if _looks_like_root(candidate):
            return candidate
        candidate = candidate.parent
    return Path.cwd()

ROOT = find_project_root()

# ----------------------------
# Helpers
# ----------------------------
def _slugify(s: str) -> str:
    """Turn free text into a lowercase, dash-separated slug.

    Every run of characters other than ASCII letters and digits becomes a
    single dash, and leading/trailing dashes are removed. Falsy input, or
    text with no usable characters, yields ``"unknown"``.
    """
    lowered = (s or "").strip().lower()
    slug = re.sub(r"[^0-9a-zA-Z]+", "-", lowered).strip("-")
    if slug:
        return slug
    return "unknown"

def _state_output_dir(state: str) -> Path:
    """Return the output directory for ``state`` under ``ROOT/outputs``.

    The slug of the full state name is preferred. If that directory is
    missing but one named after the state with " State" removed exists,
    the latter is returned. Otherwise the preferred directory is created
    (including parents) and returned.
    """
    name = (state or "").strip()
    outputs = ROOT / "outputs"

    primary = outputs / _slugify(name)
    if primary.exists():
        return primary

    legacy = outputs / _slugify(name.replace(" State",""))
    if legacy.exists():
        return legacy

    primary.mkdir(parents=True, exist_ok=True)
    return primary

def load_latest_pea_row_from_any_state() -> dict:
    """Return the newest row of the most recently modified pea_summaries.csv.

    Searches ``ROOT/outputs/<state>/pea_summaries.csv`` (one directory level
    below ``outputs``), picks the file with the latest modification time and
    returns its last row, ordered by ``timestamp_utc`` when that column
    exists and by file order otherwise.

    Returns:
        A dict with the keys ``subject``, ``state``, ``issue``, ``summary``
        and ``source_file``. Cells that are missing or blank in the CSV are
        returned as ``""`` rather than NaN. An empty dict is returned when
        no file is found, the file has no rows, or it cannot be read.
    """
    def _blank_if_missing(value):
        # pandas reads empty CSV cells as float NaN; NaN is truthy and has no
        # string methods, so callers expecting text must receive "" instead.
        if value is None:
            return ""
        if isinstance(value, float) and value != value:
            return ""
        return value

    out_dir = ROOT / "outputs"
    if not out_dir.exists():
        return {}
    candidates = list(out_dir.glob("*/pea_summaries.csv"))
    if not candidates:
        return {}
    try:
        # pick the newest file by mtime (inside the try: a file may vanish
        # between the glob and the stat call)
        latest_file = max(candidates, key=lambda p: p.stat().st_mtime)
        df = pd.read_csv(latest_file)
        if df.empty:
            return {}
        if "timestamp_utc" in df.columns:
            df = df.sort_values("timestamp_utc").tail(1)
        else:
            df = df.tail(1)
        r = df.iloc[0].to_dict()
        return {
            "subject": _blank_if_missing(r.get("subject", "")),
            "state": _blank_if_missing(r.get("state", "")),
            "issue": _blank_if_missing(r.get("issue_focus", "")),
            "summary": _blank_if_missing(r.get("summary", "")),
            "source_file": str(latest_file),
        }
    except Exception:
        # Deliberately best-effort: an unreadable file is treated the same
        # as "no previous analysis available".
        return {}

def _coerce_drop_flag(value) -> bool:
    """Interpret a single 'drop' cell as a boolean.

    Strings are compared against common "true" spellings so that text such
    as "false" is not treated as True merely because it is non-empty.
    Missing values (None/NaN) count as False; anything else uses ``bool()``.
    """
    if isinstance(value, str):
        return value.strip().lower() in ("true", "1", "yes", "y", "t")
    if value is None:
        return False
    try:
        if pd.isna(value):
            return False
    except (TypeError, ValueError):
        # Non-scalar cell: fall through to plain truthiness.
        pass
    return bool(value)


def ensure_grid_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` restricted to the grid schema with normalized types.

    The result always has exactly these columns, in this order:
    stakeholder_type, entity, category, justification, source_ref,
    state_relevance, issue_relevance, confidence, drop.

    - Missing columns are added (blank text, NaN numbers, ``drop`` False).
    - ``drop`` becomes a real boolean column; textual values such as
      "false" are parsed rather than treated as truthy strings.
    - The three score columns are coerced to numbers (unparseable -> NaN).
    - The text columns are coerced to ``str`` with missing values as "".

    ``None`` or an empty frame yields an empty frame with the schema columns.
    The input frame is never modified.
    """
    cols = [
        "stakeholder_type","entity","category","justification",
        "source_ref","state_relevance","issue_relevance","confidence","drop"
    ]
    if df is None or df.empty:
        return pd.DataFrame(columns=cols)
    df = df.copy()
    for c in cols:
        if c not in df.columns:
            df[c] = "" if c not in ("state_relevance","issue_relevance","confidence","drop") else None
    # types
    df["drop"] = df["drop"].map(_coerce_drop_flag).astype(bool)
    for c in ["state_relevance","issue_relevance","confidence"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    for c in ["stakeholder_type","entity","category","justification","source_ref"]:
        df[c] = df[c].fillna("").astype(str)
    return df[cols]

def _page(df: pd.DataFrame, page: int, page_size: int = 20):
    """Return one page of the normalized grid, re-indexed from zero.

    ``page`` is zero-based; a negative offset is clamped to the start of the
    table. An empty table is returned unchanged (after normalization), and a
    page past the end yields an empty slice.
    """
    grid = ensure_grid_columns(df)
    if grid.empty:
        return grid
    first = max(0, page * page_size)
    window = grid.iloc[first:first + page_size]
    return window.reset_index(drop=True)

# ----------------------------
# OpenAI config/client
# ----------------------------
def _load_openai_cfg():
    """Build the OpenAI settings dict from configs/config.json and the environment.

    The JSON file at ``ROOT/configs/config.json`` is read on a best-effort
    basis: a missing, unreadable, malformed or non-object file is treated as
    an empty configuration.

    Resolution order (first non-empty value wins):
    - API key: env ``OPENAI_API_KEY``, env ``OPEN_API_KEY``, then the same
      two keys in the file. Stored under ``OPENAI_API_KEY``.
    - Base URL: env ``OPENAI_BASE_URL``, file ``ENVIRONMENT_URL``, file
      ``OPENAI_BASE_URL``. Stored under ``OPENAI_BASE_URL``.
    - ``MODEL``: the file value, or ``"gpt-4o-mini"`` when absent or empty.

    Returns:
        The file's settings with the resolved values merged in.
    """
    cfg_path = ROOT / "configs" / "config.json"
    cfg = {}
    if cfg_path.exists():
        try:
            with cfg_path.open("r", encoding="utf-8") as f:
                cfg = json.load(f) or {}
        except Exception:
            cfg = {}
    if not isinstance(cfg, dict):
        # Valid JSON whose top level is a list/string/number has no settings
        # to offer; without this guard the .get() calls below would raise.
        cfg = {}
    key = (
        os.getenv("OPENAI_API_KEY") or
        os.getenv("OPEN_API_KEY") or
        cfg.get("OPENAI_API_KEY") or
        cfg.get("OPEN_API_KEY")
    )
    if key:
        cfg["OPENAI_API_KEY"] = key
    base = os.getenv("OPENAI_BASE_URL") or cfg.get("ENVIRONMENT_URL") or cfg.get("OPENAI_BASE_URL")
    if base:
        cfg["OPENAI_BASE_URL"] = base
    # `or` (not a .get default) so that "MODEL": "" / null also falls back.
    cfg["MODEL"] = cfg.get("MODEL") or "gpt-4o-mini"
    return cfg

def _openai_client_or_none():
    """Create an OpenAI client from the loaded configuration.

    Returns:
        A ``(client, error)`` pair. On success ``error`` is ``None``; when
        no API key is configured or client construction fails, ``client``
        is ``None`` and ``error`` describes the problem.
    """
    settings = _load_openai_cfg()
    api_key = settings.get("OPENAI_API_KEY")
    if not api_key:
        return None, "No OPENAI_API_KEY (or OPEN_API_KEY) found in configs/config.json or environment."

    try:
        # Imported lazily so a missing package surfaces as an error message.
        from openai import OpenAI

        base_url = settings.get("OPENAI_BASE_URL")
        if base_url:
            client = OpenAI(api_key=api_key, base_url=settings["OPENAI_BASE_URL"])
        else:
            client = OpenAI(api_key=api_key)
        return client, None
    except Exception as e:
        return None, f"OpenAI client init failed: {e}"

# ----------------------------
# Prompts (guard-railed)
# ----------------------------
# System-role message for the stakeholder request: sets the analyst persona
# and the mandatory ethical guardrails (no private personal data, public
# officials only, citable evidence, lower confidence when evidence is weak).
# It is sent as the "system" message by call_llm_for_stakeholders.
SYSTEM_MSG = """You are a careful Political Economy Analyst.

Ethical guardrails (MANDATORY):
- Do NOT include or infer any private personal data about private individuals.
- You may include public officials by role or by name ONLY if clearly public (e.g., Governor, Commissioner) AND relevant to the task.
- No doxxing, no speculative claims, no unverified allegations.
- Every row MUST include specific, citable evidence and stay within the jurisdictional scope.
- If evidence is weak, lower confidence and clearly mark uncertainty."""

def build_user_prompt(subject: str, state: str, issue: str) -> str:
    """Render the user-role prompt asking the LLM for stakeholder JSONL rows.

    Args:
        subject: Analysis subject; inserted lower-cased into the goal line.
            A falsy value is rendered as an empty string.
        state: Jurisdiction the stakeholders must relate to.
        issue: Issue focus used to prioritize stakeholders.

    Returns:
        The prompt text with surrounding whitespace stripped. Every mention
        of the state is substituted with ``state``; the original doubled
        braces (``{{state}}``) made the f-string emit the literal text
        ``{state}`` in the goal and evidence sections.
    """
    subject_text = str(subject or "").lower()
    return f"""
Goal: produce a thorough, evidence-based list of stakeholders that directly or indirectly impact, or are affected by, {subject_text} in {state}.

Include: natural persons (public officeholders only), collectives, companies, government bodies, donor orgs, CSOs/NGOs/INGOs, media, private sector, CBOs.

Stakeholder Types (use the closest):
- Government (executive, legislative, MDAs, boards, statutory authorities, regulators)
- Civil Society Organisations (CSOs/NGOs/INGOs)
- Community-Based Organisations
- Media
- Private Sector
- Donors/International Partners
- Natural persons (public officials only; cite role/source)
- Other/Unknown (sparingly)

Compulsory evidence for every row:
- Provide specific, verifiable justification tied to a precise source location.
- For PDFs: page/section/table/economic code; for web: Title — URL.
- Prefer local/state relevance; exclude generic national references unless they concretely touch {state}.

Issue tailoring:
- Focus on the issue: "{issue}". Prioritize stakeholders relevant to this issue in {state}.

Return ONLY JSONL lines with exactly these fields:
- stakeholder_type (string)
- entity           (string)
- category         ("Core" | "Secondary" | "Peripheral")
- justification    (2–3 sentences; evidence + why relevant)
- source_ref       (precise PDF page/section/code OR "Title — URL")
- state_relevance  (0–1 float)
- issue_relevance  (0–1 float)
- confidence       (0–1 float)

Rules:
- No duplicates across categories.
- Use named officeholders where verifiable; otherwise use role with doc citation.
- If evidence is insufficient, set confidence ≤ 0.5 and still cite the best source snippet.

Now, produce up to 60 JSONL rows (respecting the rule above) for:
- state = "{state}"
- issue = "{issue}"
""".strip()

def call_llm_for_stakeholders(subject: str, state: str, issue: str) -> str:
    """Ask the chat model for stakeholder JSONL and return the raw reply text.

    Args:
        subject: Analysis subject passed to ``build_user_prompt``.
        state: Jurisdiction passed to ``build_user_prompt``.
        issue: Issue focus passed to ``build_user_prompt``.

    Returns:
        The stripped message content of the first choice, or ``""`` when no
        client could be created or the request failed. Failures never raise;
        the reason is logged at WARNING level so an empty draft in the UI
        can be diagnosed.
    """
    import logging  # local import: the file header does not import logging

    log = logging.getLogger(__name__)

    client, err = _openai_client_or_none()
    if client is None:
        log.warning("Stakeholder LLM call skipped: %s", err)
        return ""

    # Only resolve the model once a client is actually available.
    model = _load_openai_cfg().get("MODEL","gpt-4o-mini")
    try:
        resp = client.chat.completions.create(
            model=model,
            temperature=0.2,
            messages=[
                {"role":"system","content": SYSTEM_MSG},
                {"role":"user","content": build_user_prompt(subject, state, issue)}
            ],
        )
        return (resp.choices[0].message.content or "").strip()
    except Exception as exc:
        # Best-effort by design: the UI treats "" as "no draft available".
        log.warning("Stakeholder LLM call failed: %s", exc)
        return ""

def parse_jsonl_to_df(jsonl_text: str) -> pd.DataFrame:
    """Parse the LLM's JSONL reply into a normalized stakeholder grid.

    Each line is stripped of leading list decoration (dashes, asterisks,
    digits, dots, closing parentheses, whitespace) and parsed as JSON.
    Lines that are blank, are not valid JSON, or decode to something other
    than a JSON object are skipped. Only the grid fields are read from each
    object; ``drop`` is always initialized to False.

    Args:
        jsonl_text: Raw model output; non-string or blank input is accepted.

    Returns:
        A DataFrame with the grid columns. It is empty (columns only) when
        no usable line is found.
    """
    cols = [
        "stakeholder_type","entity","category","justification",
        "source_ref","state_relevance","issue_relevance","confidence","drop"
    ]
    if not isinstance(jsonl_text, str) or not jsonl_text.strip():
        return pd.DataFrame(columns=cols)
    fields = [c for c in cols if c != "drop"]
    rows = []
    for line in jsonl_text.splitlines():
        ln = re.sub(r'^[\-\*\d\.\)\s]+','', (line or "").strip())
        if not ln:
            continue
        try:
            obj = json.loads(ln)
        except Exception:
            continue
        if not isinstance(obj, dict):
            # e.g. `[...]`, `"text"` or `null`: valid JSON but not a row;
            # calling .get() on it would raise and abort the whole parse.
            continue
        row = {k: obj.get(k) for k in fields}
        row["drop"] = False
        rows.append(row)
    if not rows:
        return pd.DataFrame(columns=cols)
    return ensure_grid_columns(pd.DataFrame(rows))

# ----------------------------
# Save / Final-save
# ----------------------------
def save_page_snapshot(state: str, df_page: pd.DataFrame, full_df: pd.DataFrame, page_idx: int, page_size=20):
    """Merge the edited page into the full table and write it to disk.

    The rows of ``full_df`` that make up page ``page_idx`` (``page_size``
    rows starting at ``page_idx * page_size``) are replaced by ``df_page``.
    The page may hold more or fewer rows than the window it replaces: rows
    added in the grid are inserted and rows removed in the grid disappear.
    The previous positional assignment raised ``ValueError`` when the page
    was longer than the rows remaining in the table, and left stale rows
    behind when it was shorter.

    Two CSV files are written into the state's output directory:
    ``stakeholders_llm_candidates.csv`` (overwritten each time) and a
    timestamped snapshot copy.

    Args:
        state: State name used to resolve the output directory.
        df_page: Rows currently shown in the editable grid.
        full_df: The complete candidate table before this save.
        page_idx: Zero-based index of the page being saved.
        page_size: Number of rows per page.

    Returns:
        ``(status_message, merged_full_df)``.
    """
    full_df = ensure_grid_columns(full_df)
    df_page = ensure_grid_columns(pd.DataFrame(df_page))
    start = max(0, int(page_idx) * page_size)

    # Splice: rows before the page, the edited page, rows after the page.
    # Empty pieces are dropped so concat does not have to reconcile dtypes
    # with zero-row frames.
    pieces = [full_df.iloc[:start], df_page, full_df.iloc[start + page_size:]]
    pieces = [piece for piece in pieces if not piece.empty]
    if pieces:
        full_df = ensure_grid_columns(pd.concat(pieces, ignore_index=True))
    else:
        full_df = ensure_grid_columns(None)

    out_dir = _state_output_dir(state)
    ts = datetime.utcnow().isoformat(timespec="seconds") + "Z"
    latest_path = out_dir / "stakeholders_llm_candidates.csv"
    # ':' is not allowed in file names on some platforms.
    snap_path   = out_dir / f"stakeholders_llm_candidates_{ts.replace(':','-')}.csv"

    full_df.to_csv(latest_path, index=False)   # overwrite latest
    full_df.to_csv(snap_path, index=False)     # keep snapshot
    return f"‚úÖ Saved {len(full_df)} rows.\n- Latest: {latest_path}\n- Snapshot: {snap_path}", full_df

def final_save_overwrite(subject: str, state: str, issue: str, full_df: pd.DataFrame) -> str:
    """Write the curated stakeholder table, prefixed with analysis context.

    Rows flagged ``drop`` and rows whose entity is blank are removed. The
    subject, state, issue and summary are taken from the latest
    pea_summaries.csv row when one is available; otherwise the passed-in
    values are used with an empty summary. The summary is cut to 400
    characters plus an ellipsis marker when longer. The result overwrites
    ``pea_summaries_stakeholders.csv`` in the state's output directory and
    is also written to a timestamped snapshot. Returns a status message.
    """
    grid = ensure_grid_columns(full_df)
    # Drop marked rows + empty entities
    has_entity = grid["entity"].astype(str).str.strip() != ""
    df = grid[~grid["drop"] & has_entity].copy()

    # Context columns come from the latest pea_summaries.csv (source of truth)
    summary = ""
    latest = load_latest_pea_row_from_any_state()
    if latest:
        subject = latest.get("subject", subject)
        state = latest.get("state", state)
        issue = latest.get("issue", issue)
        summary = latest.get("summary","")

    ts = datetime.utcnow().isoformat(timespec="seconds") + "Z"
    if isinstance(summary, str) and len(summary) > 400:
        excerpt = summary[:400] + "‚Ä¶"
    else:
        excerpt = summary

    leading_columns = [
        ("timestamp_utc", ts),
        ("subject", subject),
        ("state", state),
        ("issue_focus", issue),
        ("pea_summary_excerpt", excerpt),
    ]
    for position, (column, value) in enumerate(leading_columns):
        df.insert(position, column, value)

    out_dir = _state_output_dir(state)
    final_path = out_dir / "pea_summaries_stakeholders.csv"
    snap_path = out_dir / f"pea_summaries_stakeholders_{ts.replace(':','-')}.csv"

    # OVERWRITE latest (to avoid old rows reappearing), and keep a snapshot
    for target in (final_path, snap_path):
        df.to_csv(target, index=False, mode="w")
    return f"‚úÖ Final stakeholders saved (overwritten):\n- Latest: {final_path}\n- Snapshot: {snap_path}"

# ----------------------------
# Prefill locked subject/state/issue from latest pea_summaries.csv
# ----------------------------
# Resolve the locked context once at import time; the UI below only displays it.
latest = load_latest_pea_row_from_any_state()
if not (latest and latest.get("subject") and latest.get("state") and latest.get("issue")):
    # Without subject, state and issue there is nothing to identify stakeholders for.
    raise RuntimeError("No recent pea_summaries.csv found in outputs/<state>/. Run Analysis_Initiation_01 first.")

LOCK_SUBJECT, LOCK_STATE, LOCK_ISSUE = (latest[field] for field in ("subject", "state", "issue"))
LOCK_SRCFILE = latest.get("source_file","(unknown)")

# ----------------------------
# Gradio UI (Subject/State/Issue locked)
# ----------------------------
# Build the review UI. Components are declared in display order, so the
# statement order inside this block determines the layout.
with gr.Blocks(title="Stakeholder Identification") as demo:
    gr.Markdown("## Stakeholder Identification ‚Äî LLM Draft ‚Üí Human-in-the-Loop")
    gr.Markdown(f"**Loaded from:** `{LOCK_SRCFILE}`")

    with gr.Row():
        # Locked fields: not editable
        subject_tb = gr.Textbox(label="Subject (locked)", value=LOCK_SUBJECT, interactive=False)
        state_tb   = gr.Textbox(label="State/Location (locked)", value=LOCK_STATE, interactive=False)
        issue_tb   = gr.Textbox(label="Issue Focus (locked)", value=LOCK_ISSUE, interactive=False)

    # Table-level actions.
    with gr.Row():
        run_btn  = gr.Button("Generate/Refresh from LLM", variant="primary")
        add_btn  = gr.Button("Add Blank Row", variant="secondary")
        del_btn  = gr.Button("Delete Marked (drop=True)", variant="stop")

    # Paging and per-page save.
    with gr.Row():
        prev_btn = gr.Button("‚óÄÔ∏é Prev 20", variant="secondary")
        next_btn = gr.Button("Next 20 ‚ñ∂Ô∏é", variant="secondary")
        save_btn = gr.Button("Save Current Page", variant="primary")

    # Session end actions.
    with gr.Row():
        final_btn = gr.Button("Save Final (overwrite) & Finish", variant="primary")
        end_btn   = gr.Button("Finish without Saving", variant="secondary")

    # Session state: the complete candidate table and the zero-based page
    # index currently shown in the grid.
    full_df_state  = gr.State(pd.DataFrame())
    page_idx_state = gr.State(0)

    # Editable view of a single page of full_df_state.
    page_df = gr.Dataframe(
        headers=[
            "stakeholder_type","entity","category","justification",
            "source_ref","state_relevance","issue_relevance","confidence","drop"
        ],
        datatype=["str","str","str","str","str","number","number","number","bool"],
        interactive=True,
        row_count=(20, "dynamic"),
        label="Edit rows (20 per page). Set 'drop' to True to mark rows for deletion."
    )
    status_md = gr.Markdown("")

    # --- Handlers ---
    def _on_run(subject, state, issue):
        """Request a fresh draft from the LLM and show its first page."""
        jsonl = call_llm_for_stakeholders(subject, state, issue)
        df = parse_jsonl_to_df(jsonl)
        first_page = _page(df, 0, 20)
        # Returns: full table, page index reset to 0, grid contents, status text.
        return df, 0, first_page, f"Draft contains {len(df)} rows. Showing 1‚Äì{len(first_page)}."

    def _on_prev(full_df, page_idx):
        """Show the previous page; the index never goes below 0."""
        new_idx = max(0, int(page_idx) - 1)
        pg = _page(full_df, new_idx, 20)
        return new_idx, pg, f"Page {new_idx+1}. Showing {len(pg)} rows."

    def _on_next(full_df, page_idx):
        """Show the next page, staying on the current one when it is the last."""
        new_idx = int(page_idx) + 1
        pg = _page(full_df, new_idx, 20)
        if pg.empty and int(page_idx) >= 0:
            # Stepped past the end: fall back to the page we were on.
            new_idx = int(page_idx)
            pg = _page(full_df, new_idx, 20)
        return new_idx, pg, f"Page {new_idx+1}. Showing {len(pg)} rows."

    def _on_save(state, df_page, full_df, page_idx):
        """Merge the grid's rows into the full table and write it to disk."""
        # A copy is passed so the stored state object is not modified in place.
        msg, updated = save_page_snapshot(state, df_page, full_df.copy(), int(page_idx), 20)
        return msg, updated

    def _on_add(full_df, page_idx):
        """Append one blank row to the full table and redraw the current page."""
        df = ensure_grid_columns(full_df).copy()
        blank = {
            "stakeholder_type":"", "entity":"", "category":"", "justification":"",
            "source_ref":"", "state_relevance":None, "issue_relevance":None, "confidence":None, "drop":False
        }
        df = pd.concat([df, pd.DataFrame([blank])], ignore_index=True)
        pg = _page(df, int(page_idx), 20)
        return df, pg, f"‚ûï Added one blank row. Total rows: {len(df)}."

    def _on_delete_marked(full_df, page_idx):
        """Remove rows whose ``drop`` flag is set in the stored full table."""
        df = ensure_grid_columns(full_df)
        keep = df[~df["drop"]].reset_index(drop=True)
        # Clamp the page index in case the last page no longer exists.
        total_pages = max(1, (len(keep) + 19)//20)
        new_idx = min(int(page_idx), total_pages-1)
        pg = _page(keep, new_idx, 20)
        return keep, new_idx, pg, f"üóëÔ∏è Deleted marked rows. {len(full_df)-len(keep)} removed."

    def _on_final(subject, state, issue, full_df):
        """Write the final stakeholder file from the stored full table."""
        msg = final_save_overwrite(subject, state, issue, full_df)
        return msg

    def _on_end():
        """Report that the session ended; nothing is written."""
        return "Session finished without saving. You can close this tab."

    # Wire up (note: subject/state/issue are locked values)
    # NOTE(review): page_df is an input only to save_btn, so edits made in the
    # grid (including 'drop' marks) reach full_df_state only through
    # "Save Current Page"; the other handlers read the stored table.
    run_btn.click(_on_run, inputs=[subject_tb, state_tb, issue_tb], outputs=[full_df_state, page_idx_state, page_df, status_md])
    prev_btn.click(_on_prev, inputs=[full_df_state, page_idx_state], outputs=[page_idx_state, page_df, status_md])
    next_btn.click(_on_next, inputs=[full_df_state, page_idx_state], outputs=[page_idx_state, page_df, status_md])
    save_btn.click(_on_save, inputs=[state_tb, page_df, full_df_state, page_idx_state], outputs=[status_md, full_df_state])

    add_btn.click(_on_add, inputs=[full_df_state, page_idx_state], outputs=[full_df_state, page_df, status_md])
    del_btn.click(_on_delete_marked, inputs=[full_df_state, page_idx_state], outputs=[full_df_state, page_idx_state, page_df, status_md])

    final_btn.click(_on_final, inputs=[subject_tb, state_tb, issue_tb, full_df_state], outputs=[status_md])
    end_btn.click(_on_end, inputs=[], outputs=[status_md])

# Start the app without requesting a public share link.
demo.launch(inline=True, share=False)

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


