Andy0830 committed on
Commit
ef488f7
·
verified ·
1 Parent(s): d4775c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -57
app.py CHANGED
@@ -4,97 +4,114 @@ from sentence_transformers import SentenceTransformer, util
4
  import torch
5
  import os
6
  import sys
7
- import gc # 引入垃圾回收機制
 
8
 
9
  # --- 系統設定 ---
10
  SYSTEM_TITLE = "花蓮慈濟醫院公文輔助判決系統"
11
  FILE_PATH = 'data.csv'
 
 
12
 
13
- # --- 1. 讀取資料 (維持 CP950 強制讀取) ---
14
- print("🚀 正在啟動輕量模式...準備讀取 CSV")
15
 
16
  if not os.path.exists(FILE_PATH):
17
  print(f"❌ 錯誤:找不到 {FILE_PATH}")
18
  sys.exit(1)
19
 
20
  try:
21
- # 讀取 CSV
22
  df = pd.read_csv(FILE_PATH, encoding='cp950')
23
- print("✅ 資料讀取成功 (cp950)")
24
  except UnicodeDecodeError:
25
  try:
26
  df = pd.read_csv(FILE_PATH, encoding='big5')
27
- print("✅ 資料讀取成功 (big5)")
28
- except Exception as e:
29
- print(f"❌ 讀取失敗: {e}")
30
  df = pd.DataFrame()
31
- except Exception as e:
32
- print(f"❌ 未知錯誤: {e}")
33
  df = pd.DataFrame()
34
 
35
  # --- 2. 資料清洗 ---
36
  if not df.empty:
37
- # 欄位名稱標準化
38
  df.columns = [str(c).strip().replace('\ufeff', '') for c in df.columns]
39
-
40
- # 自動對應欄位
41
  for col in df.columns:
42
  if '主旨' in col or '內容' in col: df.rename(columns={col: '主旨'}, inplace=True)
43
  if '窗口' in col or '單位' in col: df.rename(columns={col: '收文窗口'}, inplace=True)
44
 
45
- # 移除空值與重複值 (減少資料量,提升速度)
46
- original_len = len(df)
47
  df = df.dropna(subset=['主旨', '收文窗口'])
48
- # df = df.drop_duplicates(subset=['主旨']) # 視情況開啟,可進一步瘦身
49
- print(f"📊 有效資料共 {len(df)} 筆 (已過濾無效資料)")
 
 
50
  else:
51
  print("❌ 資料表是空的!")
 
 
52
 
53
- # --- 3. 載入模型 (關鍵修改:換成輕量版 Small 模型) ---
54
- print("🧠 正在載入輕量版模型 (BAAI/bge-small-zh-v1.5)...")
55
- # 改用 Small 版本,速度快 3 倍,記憶體佔用極低
56
- model_name = 'BAAI/bge-small-zh-v1.5'
57
- model = SentenceTransformer(model_name)
58
 
59
- if not df.empty and '主旨' in df.columns:
60
- print(f"⚡ 正在建立索引 (共 {len(df)} 筆)...")
61
- corpus = df['主旨'].tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
- try:
64
- # ▼▼▼ 極致優化:batch_size=16 ▼▼▼
65
- corpus_embeddings = model.encode(
66
- corpus,
67
- batch_size=16, # 每次只處理 16 筆,極度安全
68
- show_progress_bar=True,
69
- convert_to_tensor=True,
70
- normalize_embeddings=True # 正規化,提升比對準度
71
- )
72
- print("✅ 索引建立完成!系統已就緒。")
73
-
74
- # 強制清理記憶體
75
- gc.collect()
76
 
77
- except Exception as e:
78
- print(f"❌ 建立索引失敗: {e}")
79
- corpus_embeddings = None
80
- else:
81
- print("❌ 無法建立索引")
82
- corpus = []
83
- corpus_embeddings = None
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
- # --- 4. 定義搜尋功能 ---
86
  def search_department(query):
 
87
  if corpus_embeddings is None:
88
  return "⚠️ 系統初始化失敗,請檢查 Logs。"
89
 
90
  if not query.strip():
91
  return "請輸入公文主旨..."
92
 
93
- # 搜尋
94
  query_embedding = model.encode(query, convert_to_tensor=True)
95
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
96
 
97
- # 取前 3 名
98
  top_k = min(3, len(corpus))
99
  top_results = torch.topk(cos_scores, k=top_k)
100
 
@@ -106,13 +123,9 @@ def search_department(query):
106
  row = df.iloc[idx]
107
  score_val = score.item()
108
 
109
- # 信心度 (Small 模型的門檻稍微調整)
110
- if score_val > 0.7:
111
- confidence = "⭐⭐⭐ 極高"
112
- elif score_val > 0.55:
113
- confidence = "⭐⭐ 高"
114
- else:
115
- confidence = "⭐ 參考"
116
 
117
  output_text += f"【推薦單位】:{row['收文窗口']}\n"
118
  output_text += f" - 歷史案例:{row['主旨']}\n"
@@ -121,13 +134,13 @@ def search_department(query):
121
 
122
  return output_text
123
 
124
- # --- 5. 啟動介面 ---
125
  iface = gr.Interface(
126
  fn=search_department,
127
  inputs=gr.Textbox(lines=3, placeholder="請輸入公文主旨..."),
128
  outputs=gr.Textbox(lines=12, label="AI 判決建議"),
129
  title=SYSTEM_TITLE,
130
- description=f"系統狀態:{'🟢 正常 (輕量模式)' if corpus_embeddings is not None else '🔴 異常'}\n資料庫收錄:{len(df)} 筆歷史資料",
131
  examples=[["檢送本署彙整人工生殖機構之捐贈生殖細胞使用情形"], ["函轉衛生局關於流感疫苗接種計畫"]]
132
  )
133
 
 
4
  import torch
5
  import os
6
  import sys
7
+ import gc
8
+ import time
9
 
10
  # --- 系統設定 ---
11
  SYSTEM_TITLE = "花蓮慈濟醫院公文輔助判決系統"
12
  FILE_PATH = 'data.csv'
13
+ # ▼▼▼ 關鍵:定義索引檔案儲存路徑 ▼▼▼
14
+ INDEX_FILE = 'corpus_embeddings.pt'
15
 
16
+ # --- 1. 讀取資料 ---
17
+ print("🚀 正在啟動快取模式...")
18
 
19
  if not os.path.exists(FILE_PATH):
20
  print(f"❌ 錯誤:找不到 {FILE_PATH}")
21
  sys.exit(1)
22
 
23
  try:
24
+ # 讀取檔案 (維持 CP950 容錯)
25
  df = pd.read_csv(FILE_PATH, encoding='cp950')
 
26
  except UnicodeDecodeError:
27
  try:
28
  df = pd.read_csv(FILE_PATH, encoding='big5')
29
+ except Exception:
 
 
30
  df = pd.DataFrame()
31
+ except Exception:
 
32
  df = pd.DataFrame()
33
 
34
  # --- 2. 資料清洗 ---
35
  if not df.empty:
 
36
  df.columns = [str(c).strip().replace('\ufeff', '') for c in df.columns]
 
 
37
  for col in df.columns:
38
  if '主旨' in col or '內容' in col: df.rename(columns={col: '主旨'}, inplace=True)
39
  if '窗口' in col or '單位' in col: df.rename(columns={col: '收文窗口'}, inplace=True)
40
 
41
+ df['主旨'] = df['主旨'].astype(str)
42
+ df['收文窗口'] = df['收文窗口'].astype(str)
43
  df = df.dropna(subset=['主旨', '收文窗口'])
44
+
45
+ corpus = df['主旨'].tolist()
46
+ total_records = len(corpus)
47
+ print(f"📊 載入全量資料: {total_records} 筆")
48
  else:
49
  print("❌ 資料表是空的!")
50
+ corpus = []
51
+ total_records = 0
52
 
53
+ # --- 3. 載入模型與建立索引 (關鍵:檢查快取) ---
 
 
 
 
54
 
55
+ # 檢查模型是否已經載入
56
+ model = None
57
+ try:
58
+ print("🧠 正在載入模型 (BAAI/bge-small-zh-v1.5)...")
59
+ model = SentenceTransformer('BAAI/bge-small-zh-v1.5')
60
+ except Exception as e:
61
+ print(f"❌ 模型載入失敗: {e}")
62
+
63
+ corpus_embeddings = None
64
+
65
+ if total_records > 0 and model is not None:
66
+ if os.path.exists(INDEX_FILE):
67
+ # 快取存在,直接載入,快速啟動!
68
+ print(f"⚡ 偵測到快取檔案 ({INDEX_FILE}),正在秒速載入...")
69
+ try:
70
+ corpus_embeddings = torch.load(INDEX_FILE)
71
+ print("✅ 索引載入完成,系統秒速啟動!")
72
+ except Exception as e:
73
+ print(f"❌ 快取檔案損壞,將重新計算索引。錯誤: {e}")
74
+ corpus_embeddings = None # 設為 None 重新計算
75
 
76
+ if corpus_embeddings is None:
77
+ # 第一次啟動或快取損壞,進行耗時的計算
78
+ print(f"🔥 第一次啟動或快取失效,開始分批計算索引 (這需要約 2-4 分鐘)...")
79
+ chunk_size = 500
80
+ embeddings_chunks = []
81
+ start_time = time.time()
 
 
 
 
 
 
 
82
 
83
+ try:
84
+ for i in range(0, total_records, chunk_size):
85
+ batch = corpus[i : i + chunk_size]
86
+ batch_emb = model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
87
+ embeddings_chunks.append(batch_emb)
88
+ print(f" -> 已處理 {min(i + chunk_size, total_records)} / {total_records} 筆...")
89
+ gc.collect()
90
+
91
+ # 合併與儲存
92
+ print("🔗 正在合併並儲存索引...")
93
+ corpus_embeddings = torch.cat(embeddings_chunks)
94
+ torch.save(corpus_embeddings, INDEX_FILE) # ▼▼▼ 儲存索引到硬碟 ▼▼▼
95
+
96
+ end_time = time.time()
97
+ print(f"✅ 全量索引計算並儲存完成!耗時 {int(end_time - start_time)} 秒。")
98
+
99
+ except Exception as e:
100
+ print(f"❌ 索引計算失敗 (可能記憶體不足): {e}")
101
+ corpus_embeddings = None
102
 
103
+ # --- 4. 定義搜尋 ---
104
  def search_department(query):
105
+ # 這裡的邏輯與之前相同,不需要修改
106
  if corpus_embeddings is None:
107
  return "⚠️ 系統初始化失敗,請檢查 Logs。"
108
 
109
  if not query.strip():
110
  return "請輸入公文主旨..."
111
 
 
112
  query_embedding = model.encode(query, convert_to_tensor=True)
113
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
114
 
 
115
  top_k = min(3, len(corpus))
116
  top_results = torch.topk(cos_scores, k=top_k)
117
 
 
123
  row = df.iloc[idx]
124
  score_val = score.item()
125
 
126
+ if score_val > 0.7: confidence = "⭐⭐⭐ 極高"
127
+ elif score_val > 0.55: confidence = "⭐⭐ 高"
128
+ else: confidence = " 參考"
 
 
 
 
129
 
130
  output_text += f"【推薦單位】:{row['收文窗口']}\n"
131
  output_text += f" - 歷史案例:{row['主旨']}\n"
 
134
 
135
  return output_text
136
 
137
+ # --- 5. 介面 ---
138
  iface = gr.Interface(
139
  fn=search_department,
140
  inputs=gr.Textbox(lines=3, placeholder="請輸入公文主旨..."),
141
  outputs=gr.Textbox(lines=12, label="AI 判決建議"),
142
  title=SYSTEM_TITLE,
143
+ description=f"系統狀態:{'🟢 快取就緒' if corpus_embeddings is not None else '🔴 索引失敗'}\n資料庫完整收錄:{total_records} 筆歷史資料 (無刪減)",
144
  examples=[["檢送本署彙整人工生殖機構之捐贈生殖細胞使用情形"], ["函轉衛生局關於流感疫苗接種計畫"]]
145
  )
146