SYSTEM / app.py
Andy0830's picture
Update app.py
8c3f3bc verified
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import os
import sys
import gc
import time
# --- System settings ---
SYSTEM_TITLE = "花蓮慈濟醫院公文輔助判決系統"
FILE_PATH = 'data.csv'
INDEX_FILE = 'corpus_embeddings.pt'
# Login credentials, in the form ("username", "password").
# They are read from the LOGIN_USERNAME / LOGIN_PASSWORD environment variables
# so deployments can keep the secret out of source control. An unset or empty
# variable falls back to the legacy built-in value, so existing deployments
# keep working unchanged.
# NOTE(review): the fallback password is still committed to the repository;
# rotate it and drop the default once the environment secret is configured.
LOGIN_DATA = (
    os.environ.get("LOGIN_USERNAME") or "admin",
    os.environ.get("LOGIN_PASSWORD") or "htch15583",
)
# --- 1. Load data ---
print("🚀 正在啟動系統...")
if not os.path.exists(FILE_PATH):
    print(f"❌ 錯誤:找不到 {FILE_PATH}")
    sys.exit(1)

# Try the encodings in priority order (CP950 first, as before). 'utf-8-sig' is
# a last resort for files exported as UTF-8, with or without a BOM.
# On any failure `df` stays an empty DataFrame, which the later steps treat as
# "no data"; the reason is now printed instead of being silently dropped.
df = pd.DataFrame()
for _encoding in ('cp950', 'big5', 'utf-8-sig'):
    try:
        df = pd.read_csv(FILE_PATH, encoding=_encoding)
        break
    except UnicodeDecodeError:
        # Wrong encoding guess: move on to the next candidate.
        continue
    except Exception as e:
        # Not an encoding problem (malformed or empty CSV, I/O error, ...):
        # retrying with another encoding would not help.
        print(f"❌ 讀取 {FILE_PATH} 失敗: {e}")
        break
else:
    print(f"❌ 無法以 cp950 / big5 / utf-8 解碼 {FILE_PATH}")
# --- 2. Data cleaning ---
def _pick_column(columns, target, keywords, exclude=()):
    """Return the header to treat as *target*, or None when nothing matches.

    An exact match on *target* wins; otherwise the first header containing any
    of *keywords* is used. Headers listed in *exclude* are never returned, so
    one physical column cannot be claimed for two roles.
    """
    candidates = [c for c in columns if c not in exclude]
    if target in candidates:
        return target
    return next((c for c in candidates if any(k in c for k in keywords)), None)


corpus = []
total_records = 0
if df.empty:
    print("❌ 資料表是空的!")
else:
    # Strip BOM characters and surrounding whitespace from the headers.
    df.columns = [str(c).strip().replace('\ufeff', '') for c in df.columns]
    _headers = list(df.columns)
    # Map the source headers onto the two canonical column names. Only one
    # column is chosen per role, so several matching headers can no longer
    # produce duplicate column names.
    _subject_col = _pick_column(_headers, '主旨', ('主旨', '內容'))
    _window_col = _pick_column(
        _headers, '收文窗口', ('窗口', '單位'), exclude=(_subject_col,)
    )
    if _subject_col is None or _window_col is None:
        # Previously this case crashed with a KeyError; now the system starts
        # with zero records and reports which headers were found.
        print(f"❌ 找不到必要欄位(主旨 / 收文窗口),現有欄位: {_headers}")
    else:
        df = df.rename(columns={_subject_col: '主旨', _window_col: '收文窗口'})
        # Drop missing rows BEFORE casting to str: casting first turns NaN
        # into the literal text 'nan', after which dropna() removes nothing.
        # NOTE: this can reduce the row count, so an INDEX_FILE built from an
        # older row set no longer lines up with `df`; delete it to rebuild.
        df = df.dropna(subset=['主旨', '收文窗口'])
        df = df.astype({'主旨': str, '收文窗口': str})
        corpus = df['主旨'].tolist()
        total_records = len(corpus)
        print(f"📊 載入全量資料: {total_records} 筆")
# --- 3. Load the model and build the index ---
model = None
try:
    print("🧠 正在載入模型 (BAAI/bge-small-zh-v1.5)...")
    model = SentenceTransformer('BAAI/bge-small-zh-v1.5')
except Exception as e:
    print(f"❌ 模型載入失敗: {e}")

corpus_embeddings = None
if total_records > 0 and model is not None:
    # Reuse the cached index when one exists AND still matches the data.
    if os.path.exists(INDEX_FILE):
        print("⚡ 偵測到快取檔案,正在秒速載入...")
        try:
            # NOTE(review): torch.load unpickles the file; only load cache
            # files that this application wrote itself.
            _cached = torch.load(INDEX_FILE)
        except Exception as e:
            print(f"❌ 快取檔案損壞,將重新計算。錯誤: {e}")
        else:
            # A cache built from a different row count would map similarity
            # scores onto the wrong rows, so it is discarded and rebuilt.
            # (Edits that keep the row count unchanged are NOT detected here.)
            if (
                isinstance(_cached, torch.Tensor)
                and _cached.dim() == 2
                and _cached.shape[0] == total_records
            ):
                corpus_embeddings = _cached
                print("✅ 索引載入完成!")
            else:
                print("⚠️ 快取與目前資料筆數不符,將重新計算。")

    # No usable cache: encode the whole corpus in chunks.
    if corpus_embeddings is None:
        print("🔥 開始計算索引 (需時約 2-4 分鐘,請耐心等候)...")
        chunk_size = 500
        embeddings_chunks = []
        try:
            for i in range(0, total_records, chunk_size):
                batch = corpus[i : i + chunk_size]
                batch_emb = model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
                embeddings_chunks.append(batch_emb)
                print(f" -> 已處理 {min(i + chunk_size, total_records)} / {total_records} 筆...")
                gc.collect()
            corpus_embeddings = torch.cat(embeddings_chunks)
        except Exception as e:
            print(f"❌ 索引計算失敗: {e}")
            corpus_embeddings = None
        else:
            # Saving is only an optimisation for the next start-up. A failed
            # write (for example a read-only disk) must not throw away the
            # index that was just computed.
            try:
                torch.save(corpus_embeddings, INDEX_FILE)
                print("✅ 索引計算並儲存完成!")
            except Exception as e:
                print(f"⚠️ 索引已計算完成,但快取儲存失敗: {e}")
# --- 4. Search ---
def search_department(query):
    """Recommend receiving departments for an official-document subject line.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against ``corpus_embeddings``; up to three best-matching
    historical rows of ``df`` are reported.

    Args:
        query: Subject text entered by the user. ``None``, empty and
            whitespace-only values are treated as "nothing entered".

    Returns:
        A human-readable report string, or a short notice when the index is
        unavailable or no query text was given.
    """
    if corpus_embeddings is None:
        return "⚠️ 系統初始化失敗,請檢查 Logs。"
    if not query or not query.strip():
        return "請輸入公文主旨..."

    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    # Bound k by the number of rows actually scored; torch.topk raises when k
    # exceeds the tensor length.
    top_k = min(3, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=top_k)

    parts = ["🔍 分析結果:\n", "=" * 30, "\n"]
    for score, idx in zip(top_results.values, top_results.indices):
        idx = idx.item()
        # Skip hits that have no corresponding data row.
        if idx >= len(df):
            continue
        row = df.iloc[idx]
        score_val = score.item()

        if score_val > 0.7:
            confidence = "⭐⭐⭐ 極高"
        elif score_val > 0.55:
            confidence = "⭐⭐ 高"
        else:
            confidence = "⭐ 參考"

        parts.append(f"【推薦單位】:{row['收文窗口']}\n")
        parts.append(f" - 歷史案例:{row['主旨']}\n")
        parts.append(f" - 相似度:{score_val:.4f} ({confidence})\n")
        parts.append("-" * 20 + "\n")
    return "".join(parts)
# --- 5. User interface (login required at launch) ---
# Status badge shown in the page description: green when the index is
# available, red otherwise.
_status_badge = '🔴 異常' if corpus_embeddings is None else '🟢 系統正常'

_subject_input = gr.Textbox(lines=3, placeholder="請輸入公文主旨...")
_advice_output = gr.Textbox(lines=12, label="AI 判決建議")
_sample_queries = [
    ["檢送本署彙整人工生殖機構之捐贈生殖細胞使用情形"],
    ["函轉衛生局關於流感疫苗接種計畫"],
]

iface = gr.Interface(
    fn=search_department,
    inputs=_subject_input,
    outputs=_advice_output,
    title=SYSTEM_TITLE,
    description=f"系統狀態:{_status_badge}\n資料庫收錄:{total_records} 筆歷史資料",
    examples=_sample_queries,
)

if __name__ == "__main__":
    # Launch with username/password authentication enabled.
    iface.launch(auth=LOGIN_DATA)