File size: 5,232 Bytes
b873665 e63a87a 3d381ee 24a7574 ef488f7 b873665 e63a87a 7832421 e63a87a ef488f7 3d381ee 8c3f3bc 0ed4edb 8c3f3bc 0ed4edb ef488f7 8c3f3bc 69f5fb8 24a7574 d7b3f86 7832421 24a7574 8c3f3bc 24a7574 7832421 24a7574 ef488f7 d7b3f86 ef488f7 24a7574 e63a87a d7b3f86 24a7574 8c3f3bc 24a7574 8c3f3bc d7b3f86 69f5fb8 8c3f3bc ef488f7 24a7574 ef488f7 d7b3f86 ef488f7 69f5fb8 0ed4edb ef488f7 8c3f3bc ef488f7 0ed4edb ef488f7 0ed4edb ef488f7 0ed4edb d7b3f86 8c3f3bc ef488f7 8c3f3bc ef488f7 1174aab ef488f7 8c3f3bc 0ed4edb ef488f7 0ed4edb ef488f7 a81a66c ef488f7 e63a87a 24a7574 8c3f3bc e63a87a 7832421 e63a87a ef488f7 3d381ee 7832421 e63a87a 7832421 8c3f3bc e63a87a 24a7574 e63a87a 0ed4edb e63a87a 8cd002c 7832421 8c3f3bc 0ed4edb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import os
import sys
import gc
import time
# --- System settings ---
SYSTEM_TITLE = "花蓮慈濟醫院公文輔助判決系統"  # title shown by the web UI
FILE_PATH = 'data.csv'  # CSV of historical documents read at startup
INDEX_FILE = 'corpus_embeddings.pt'  # on-disk cache of the corpus embeddings

# Login credentials as an ("account", "password") tuple, handed to
# `launch(auth=...)` at the bottom of the file.
# SECURITY: a password committed to source control is readable by anyone who
# can read the repository. Set DOC_ROUTER_USER / DOC_ROUTER_PASSWORD in the
# deployment environment to override the built-in values; the defaults are
# kept only so that existing deployments keep working unchanged.
LOGIN_DATA = (
    os.environ.get("DOC_ROUTER_USER", "admin"),
    os.environ.get("DOC_ROUTER_PASSWORD", "htch15583"),
)
# --- 1. Load data ---
print("🚀 正在啟動系統...")
if not os.path.exists(FILE_PATH):
    print(f"❌ 錯誤:找不到 {FILE_PATH}")
    sys.exit(1)

# Encodings to try, in order. CP950 stays first (the historical default);
# 'utf-8-sig' is the last resort so a UTF-8 export (with or without BOM)
# loads instead of silently producing an empty table.
_CSV_ENCODINGS = ('cp950', 'big5', 'utf-8-sig')

# `df` stays an empty DataFrame when nothing could be read; the rest of the
# script treats an empty table as "no data" rather than crashing.
df = pd.DataFrame()
for _encoding in _CSV_ENCODINGS:
    try:
        df = pd.read_csv(FILE_PATH, encoding=_encoding)
    except UnicodeDecodeError:
        # Wrong encoding guess: move on to the next candidate.
        continue
    except Exception as e:
        # Any other failure (empty file, parse error, ...) is not an encoding
        # problem, so retrying is pointless. Report it instead of hiding it.
        print(f"❌ 讀取 {FILE_PATH} 失敗: {e}")
    break
else:
    # Every candidate raised UnicodeDecodeError.
    print(f"❌ 無法以 {', '.join(_CSV_ENCODINGS)} 任一編碼讀取 {FILE_PATH}")
# --- 2. Clean data ---
# `corpus` holds the subject text of every usable row, in the same row order
# as `df`; `total_records` is its length. Both stay empty/zero when the table
# is empty or unusable.
# NOTE: if INDEX_FILE was built from a different set of rows, delete it so
# the embeddings are recomputed for the rows kept here.
corpus = []
total_records = 0

if df.empty:
    print("❌ 資料表是空的!")
else:
    # Strip surrounding whitespace and any BOM from the header names.
    df.columns = [str(c).strip().replace('\ufeff', '') for c in df.columns]

    # Map loosely named headers onto the two canonical column names. Only the
    # first matching column is renamed, and nothing is renamed when the
    # canonical name already exists, so duplicate column names cannot appear.
    _column_keywords = {
        '主旨': ('主旨', '內容'),
        '收文窗口': ('窗口', '單位'),
    }
    for _target, _keywords in _column_keywords.items():
        if _target in df.columns:
            continue
        for col in df.columns:
            if any(keyword in col for keyword in _keywords):
                df = df.rename(columns={col: _target})
                break

    _missing = [c for c in ('主旨', '收文窗口') if c not in df.columns]
    if _missing:
        # Without both columns no recommendation can be produced; report it
        # and leave the corpus empty instead of raising KeyError.
        print(f"❌ 資料表缺少必要欄位: {_missing}")
    else:
        # Drop rows with missing values BEFORE converting to str; converting
        # first would turn NaN into the literal text 'nan' and keep the row.
        df = df.dropna(subset=['主旨', '收文窗口'])
        df = df.astype({'主旨': str, '收文窗口': str})
        corpus = df['主旨'].tolist()
        total_records = len(corpus)
        print(f"📊 載入全量資料: {total_records} 筆")
# --- 3. Load the model and build the index ---
# `model` stays None when loading fails; `corpus_embeddings` stays None when
# no usable index could be loaded or computed. The search function and the
# UI status line both check `corpus_embeddings is None`.
model = None
try:
    print("🧠 正在載入模型 (BAAI/bge-small-zh-v1.5)...")
    model = SentenceTransformer('BAAI/bge-small-zh-v1.5')
except Exception as e:
    print(f"❌ 模型載入失敗: {e}")

corpus_embeddings = None
if total_records > 0 and model is not None:
    # Reuse the cached index when one exists.
    if os.path.exists(INDEX_FILE):
        print("⚡ 偵測到快取檔案,正在秒速載入...")
        try:
            # NOTE(security): torch.load unpickles the file. Only load a
            # cache file that this application wrote itself.
            _cached = torch.load(INDEX_FILE)
        except Exception as e:
            print(f"❌ 快取檔案損壞,將重新計算。錯誤: {e}")
        else:
            # A cache built from a different data.csv would pair scores with
            # the wrong rows, so accept it only when it has exactly one
            # embedding row per record.
            if (
                torch.is_tensor(_cached)
                and _cached.dim() == 2
                and _cached.size(0) == total_records
            ):
                corpus_embeddings = _cached
                print("✅ 索引載入完成!")
            else:
                print("⚠️ 快取與目前資料筆數不符,將重新計算。")

    # No usable cache: encode the whole corpus.
    if corpus_embeddings is None:
        print("🔥 開始計算索引 (需時約 2-4 分鐘,請耐心等候)...")
        chunk_size = 500
        embeddings_chunks = []
        try:
            # Encode in chunks so progress can be reported between batches.
            for i in range(0, total_records, chunk_size):
                batch = corpus[i : i + chunk_size]
                batch_emb = model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
                embeddings_chunks.append(batch_emb)
                print(f" -> 已處理 {min(i + chunk_size, total_records)} / {total_records} 筆...")
                gc.collect()
            corpus_embeddings = torch.cat(embeddings_chunks)
        except Exception as e:
            print(f"❌ 索引計算失敗: {e}")
            corpus_embeddings = None

        if corpus_embeddings is not None:
            # Persist the index so the next start-up is fast. A failed write
            # (e.g. read-only disk) must not discard the in-memory index.
            try:
                torch.save(corpus_embeddings, INDEX_FILE)
                print("✅ 索引計算並儲存完成!")
            except Exception as e:
                print(f"⚠️ 索引已計算完成,但無法儲存快取: {e}")
# --- 4. Search ---
def search_department(query):
    """Recommend receiving units for a document subject.

    Encodes ``query`` with the module-level ``model``, scores it against
    ``corpus_embeddings`` by cosine similarity and reports the best matches
    (at most three) together with the historical subject and a confidence
    label derived from the score.

    Args:
        query: Subject text typed by the user. ``None``, empty and
            whitespace-only values are treated as "nothing entered".

    Returns:
        A human-readable, multi-line string: an error notice when the index
        is unavailable, an input prompt when ``query`` is blank, otherwise
        the formatted list of recommendations.
    """
    if corpus_embeddings is None:
        return "⚠️ 系統初始化失敗,請檢查 Logs。"
    # `not query` also covers None, which has no .strip().
    if not query or not query.strip():
        return "請輸入公文主旨..."

    query_embedding = model.encode(query.strip(), convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_k = min(3, len(corpus))
    top_results = torch.topk(cos_scores, k=top_k)

    # Collect the pieces and join once instead of repeated string +=.
    parts = ["🔍 分析結果:\n", "=" * 30, "\n"]
    for score_val, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
        # Skip any position that has no matching row in the table.
        if idx >= len(df):
            continue
        row = df.iloc[idx]
        if score_val > 0.7:
            confidence = "⭐⭐⭐ 極高"
        elif score_val > 0.55:
            confidence = "⭐⭐ 高"
        else:
            confidence = "⭐ 參考"
        parts.append(f"【推薦單位】:{row['收文窗口']}\n")
        parts.append(f" - 歷史案例:{row['主旨']}\n")
        parts.append(f" - 相似度:{score_val:.4f} ({confidence})\n")
        parts.append("-" * 20 + "\n")
    return "".join(parts)
# --- 5. Web UI (password protection enabled) ---
# The status badge reflects whether a usable embedding index exists.
_status_badge = '🔴 異常' if corpus_embeddings is None else '🟢 系統正常'
_description = f"系統狀態:{_status_badge}\n資料庫收錄:{total_records} 筆歷史資料"

# Input and output widgets, created in the same order as before.
_query_box = gr.Textbox(lines=3, placeholder="請輸入公文主旨...")
_result_box = gr.Textbox(lines=12, label="AI 判決建議")

# Sample subjects offered as one-click examples.
_sample_queries = [
    ["檢送本署彙整人工生殖機構之捐贈生殖細胞使用情形"],
    ["函轉衛生局關於流感疫苗接種計畫"],
]

iface = gr.Interface(
    fn=search_department,
    inputs=_query_box,
    outputs=_result_box,
    title=SYSTEM_TITLE,
    description=_description,
    examples=_sample_queries,
)

if __name__ == "__main__":
    # Require the configured account/password before serving the UI.
    iface.launch(auth=LOGIN_DATA)