Charles Chan committed on
Commit
a054c10
·
1 Parent(s): 630d3f4
Files changed (2) hide show
  1. app.py +5 -6
  2. requirements.txt +1 -0
app.py CHANGED
@@ -4,14 +4,13 @@ from langchain_community.llms import HuggingFaceHub
4
  from langchain_community.embeddings import SentenceTransformerEmbeddings
5
  from langchain_community.vectorstores import FAISS
6
  from datasets import load_dataset
7
- from transformers import pipeline
8
 
9
  # 使用 進擊的巨人 数据集
10
  try:
11
- converter = pipeline("translation_zh_tw_zh_cn")
12
  dataset = load_dataset("rorubyy/attack_on_titan_wiki_chinese")
13
- answer_list = [converter(example["Answer"])[0]["translation_text"] for example in dataset["train"]]
14
-
15
  except Exception as e:
16
  st.error(f"读取数据集失败:{e}")
17
  st.stop()
@@ -82,9 +81,9 @@ with col3:
82
  random_index = random.randint(0, dataset_size - 1)
83
  # 读取随机问题
84
  random_question = dataset["train"][random_index]["Question"]
85
- random_question = converter(random_question)[0]["translation_text"]
86
  origin_answer = dataset["train"][random_index]["Answer"]
87
- origin_answer = converter(origin_answer)[0]["translation_text"]
88
  print('[]' + str(random_index) + '/' + str(dataset_size) + ']random_question: ' + random_question)
89
  print('origin_answer: ' + origin_answer)
90
 
 
4
  from langchain_community.embeddings import SentenceTransformerEmbeddings
5
  from langchain_community.vectorstores import FAISS
6
  from datasets import load_dataset
7
+ from opencc import OpenCC
8
 
9
  # 使用 進擊的巨人 数据集
10
  try:
11
+ converter = OpenCC('tw2s.json') # 'tw2s.json' 表示繁体中文到简体中文的转换
12
  dataset = load_dataset("rorubyy/attack_on_titan_wiki_chinese")
13
+ answer_list = [converter.convert(example["Answer"]) for example in dataset["train"]]
 
14
  except Exception as e:
15
  st.error(f"读取数据集失败:{e}")
16
  st.stop()
 
81
  random_index = random.randint(0, dataset_size - 1)
82
  # 读取随机问题
83
  random_question = dataset["train"][random_index]["Question"]
84
+ random_question = converter.convert(random_question)
85
  origin_answer = dataset["train"][random_index]["Answer"]
86
+ origin_answer = converter.convert(origin_answer)
87
  print('[]' + str(random_index) + '/' + str(dataset_size) + ']random_question: ' + random_question)
88
  print('origin_answer: ' + origin_answer)
89
 
requirements.txt CHANGED
@@ -6,3 +6,4 @@ langchain-huggingface
6
  sentence_transformers
7
  faiss-cpu
8
  datasets
 
 
6
  sentence_transformers
7
  faiss-cpu
8
  datasets
9
+ opencc-python-reimplemented