File size: 5,232 Bytes
b873665
e63a87a
 
 
3d381ee
24a7574
ef488f7
 
b873665
e63a87a
7832421
e63a87a
ef488f7
3d381ee
8c3f3bc
0ed4edb
8c3f3bc
0ed4edb
ef488f7
8c3f3bc
69f5fb8
24a7574
d7b3f86
 
7832421
24a7574
8c3f3bc
24a7574
 
7832421
24a7574
ef488f7
d7b3f86
ef488f7
24a7574
e63a87a
d7b3f86
24a7574
8c3f3bc
24a7574
8c3f3bc
 
d7b3f86
 
 
69f5fb8
8c3f3bc
ef488f7
 
24a7574
ef488f7
 
 
 
d7b3f86
 
ef488f7
 
69f5fb8
0ed4edb
ef488f7
 
 
 
 
 
 
 
 
 
8c3f3bc
ef488f7
0ed4edb
ef488f7
 
0ed4edb
ef488f7
0ed4edb
 
d7b3f86
8c3f3bc
ef488f7
8c3f3bc
ef488f7
 
1174aab
ef488f7
 
 
 
 
 
 
 
 
8c3f3bc
0ed4edb
 
ef488f7
 
0ed4edb
ef488f7
a81a66c
ef488f7
e63a87a
24a7574
8c3f3bc
e63a87a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7832421
e63a87a
ef488f7
 
 
3d381ee
7832421
 
 
 
e63a87a
 
7832421
8c3f3bc
e63a87a
 
24a7574
e63a87a
 
0ed4edb
e63a87a
 
8cd002c
7832421
8c3f3bc
0ed4edb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import os
import sys
import gc 
import time

# --- System settings ---
SYSTEM_TITLE = "花蓮慈濟醫院公文輔助判決系統"
FILE_PATH = 'data.csv'  # CSV read at startup
INDEX_FILE = 'corpus_embeddings.pt'  # on-disk cache of the corpus embeddings

# Login credentials handed to Gradio as an ("account", "password") tuple.
# SECURITY: the fallback values below are committed in source and must be
# treated as public. Set APP_LOGIN_USER / APP_LOGIN_PASSWORD in the
# deployment environment (and rotate the password) instead of relying on
# them; the fallback is kept only so existing deployments keep working.
LOGIN_DATA = (
    os.environ.get("APP_LOGIN_USER", "admin"),
    os.environ.get("APP_LOGIN_PASSWORD", "htch15583"),
)

# --- 1. Load data ---
print("🚀 正在啟動系統...")

if not os.path.exists(FILE_PATH):
    print(f"❌ 錯誤:找不到 {FILE_PATH}")
    sys.exit(1)

# Try each candidate encoding in turn. UTF-8 (with optional BOM) goes first
# because its decoder is strict: a CP950/Big5 file containing Chinese text
# will not validate as UTF-8, whereas a UTF-8 file can occasionally decode
# as CP950 into garbage. CP950 and Big5 are the original fallbacks.
# Any failure leaves `df` as an empty DataFrame so the rest of the script
# can still start and report the problem in the UI.
df = pd.DataFrame()
for _encoding in ('utf-8-sig', 'cp950', 'big5'):
    try:
        df = pd.read_csv(FILE_PATH, encoding=_encoding)
        break
    except UnicodeDecodeError:
        # Wrong encoding guess; move on to the next candidate.
        continue
    except Exception as e:
        # Not an encoding problem (malformed/empty CSV, permissions, ...):
        # retrying with another encoding will not help, so report and stop.
        print(f"❌ 讀取 {FILE_PATH} 失敗: {e}")
        break
else:
    print(f"❌ 無法以 utf-8-sig / cp950 / big5 解碼 {FILE_PATH}")

# --- 2. Clean data ---
# Defaults used when the table is empty or unusable.
corpus = []
total_records = 0

if df.empty:
    print("❌ 資料表是空的!")
else:
    # Strip BOM characters and surrounding whitespace from the headers.
    df.columns = [str(c).strip().replace('\ufeff', '') for c in df.columns]

    # Map header variants onto the two canonical column names. A column that
    # already has the canonical name wins; otherwise the first header that
    # contains one of the keywords is renamed. Renaming at most one column per
    # target avoids creating duplicate '主旨' / '收文窗口' columns.
    _rename = {}
    for _target, _keywords in (('主旨', ('主旨', '內容')), ('收文窗口', ('窗口', '單位'))):
        if _target in df.columns:
            continue
        for col in df.columns:
            if col not in _rename and any(k in col for k in _keywords):
                _rename[col] = _target
                break
    df = df.rename(columns=_rename)

    _missing = [c for c in ('主旨', '收文窗口') if c not in df.columns]
    if _missing:
        # Without both columns nothing can be indexed; fall back to the
        # "empty table" state instead of dying with a KeyError.
        print(f"❌ 資料表缺少必要欄位: {_missing}")
        df = pd.DataFrame()
    else:
        # Drop missing values BEFORE casting to str: astype(str) turns NaN
        # into the literal string 'nan', after which dropna removes nothing.
        df = df.dropna(subset=['主旨', '收文窗口']).reset_index(drop=True)
        df['主旨'] = df['主旨'].astype(str)
        df['收文窗口'] = df['收文窗口'].astype(str)

        corpus = df['主旨'].tolist()
        total_records = len(corpus)
        print(f"📊 載入全量資料: {total_records} 筆")

# --- 3. Load the model and build the index ---
model = None
try:
    print("🧠 正在載入模型 (BAAI/bge-small-zh-v1.5)...")
    model = SentenceTransformer('BAAI/bge-small-zh-v1.5')
except Exception as e:
    # Keep the script running so the UI can report the failure.
    print(f"❌ 模型載入失敗: {e}")

corpus_embeddings = None

if total_records > 0 and model is not None:
    # Reuse the cached index when one exists.
    if os.path.exists(INDEX_FILE):
        print("⚡ 偵測到快取檔案,正在秒速載入...")
        try:
            # NOTE(security): torch.load unpickles the file; only load cache
            # files written by this application.
            # map_location keeps the cached tensor on the same device as the
            # query embeddings produced by the model.
            corpus_embeddings = torch.load(INDEX_FILE, map_location=model.device)
        except Exception as e:
            print(f"❌ 快取檔案損壞,將重新計算。錯誤: {e}")
            corpus_embeddings = None
        else:
            # A cache built from a different data.csv would map scores onto
            # the wrong rows. The row count is the only thing checked here:
            # it catches added/removed rows, not edits that keep the count.
            cache_is_current = (
                torch.is_tensor(corpus_embeddings)
                and corpus_embeddings.dim() == 2
                and corpus_embeddings.shape[0] == total_records
            )
            if cache_is_current:
                print("✅ 索引載入完成!")
            else:
                print("⚠️ 快取筆數與目前資料不符,將重新計算。")
                corpus_embeddings = None

    # No usable cache: encode the whole corpus.
    if corpus_embeddings is None:
        print("🔥 開始計算索引 (需時約 2-4 分鐘,請耐心等候)...")
        chunk_size = 500
        embeddings_chunks = []

        try:
            for i in range(0, total_records, chunk_size):
                batch = corpus[i : i + chunk_size]
                batch_emb = model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
                embeddings_chunks.append(batch_emb)
                print(f"   -> 已處理 {min(i + chunk_size, total_records)} / {total_records} 筆...")
                gc.collect()

            corpus_embeddings = torch.cat(embeddings_chunks)
        except Exception as e:
            print(f"❌ 索引計算失敗: {e}")
            corpus_embeddings = None
        else:
            # Persist the index so the next start-up can skip the encoding.
            # A failed write must not throw away the in-memory index.
            try:
                torch.save(corpus_embeddings, INDEX_FILE)
                print("✅ 索引計算並儲存完成!")
            except Exception as e:
                print(f"⚠️ 索引已計算,但無法寫入快取檔案: {e}")

# --- 4. Search ---
def search_department(query):
    """Return a text report of the historical records most similar to *query*.

    The query is encoded with the module-level ``model`` and compared by
    cosine similarity against ``corpus_embeddings``. Up to three best matches
    are reported with the receiving unit ('收文窗口'), the historical subject
    ('主旨'), the similarity score and a confidence label.

    Args:
        query: Subject text typed by the user. ``None`` or blank input is
            treated as "nothing entered".

    Returns:
        The formatted report, or a short message when the index is
        unavailable or the query is empty.
    """
    if corpus_embeddings is None:
        return "⚠️ 系統初始化失敗,請檢查 Logs。"

    if query is None or not str(query).strip():
        return "請輸入公文主旨..."

    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    # Bound k by the number of scores actually available; torch.topk raises
    # when k exceeds the length of its input.
    top_k = min(3, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=top_k)

    lines = ["🔍 分析結果:", "=" * 30]

    for score_val, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
        # Skip positions that have no matching row in the table.
        if idx >= len(df):
            continue
        row = df.iloc[idx]

        if score_val > 0.7:
            confidence = "⭐⭐⭐ 極高"
        elif score_val > 0.55:
            confidence = "⭐⭐ 高"
        else:
            confidence = "⭐ 參考"

        lines.append(f"【推薦單位】:{row['收文窗口']}")
        lines.append(f"  - 歷史案例:{row['主旨']}")
        lines.append(f"  - 相似度:{score_val:.4f} ({confidence})")
        lines.append("-" * 20)

    # Every line, including the last, ends with a newline.
    return "\n".join(lines) + "\n"

# --- 5. UI (password protection enabled) ---
# Status shown in the page description: healthy only when an index exists.
_status_label = '🟢 系統正常' if corpus_embeddings is not None else '🔴 異常'

_query_box = gr.Textbox(lines=3, placeholder="請輸入公文主旨...")
_result_box = gr.Textbox(lines=12, label="AI 判決建議")

# Sample queries offered on the page; one inner list per example.
_sample_queries = [
    ["檢送本署彙整人工生殖機構之捐贈生殖細胞使用情形"],
    ["函轉衛生局關於流感疫苗接種計畫"],
]

iface = gr.Interface(
    fn=search_department,
    inputs=_query_box,
    outputs=_result_box,
    title=SYSTEM_TITLE,
    description=f"系統狀態:{_status_label}\n資料庫收錄:{total_records} 筆歷史資料",
    examples=_sample_queries,
)

if __name__ == "__main__":
    # Require the configured account/password before serving the app.
    iface.launch(auth=LOGIN_DATA)