maxmon committed
Commit • 1086ffd
Parent(s): 1743235
feat: init

Files changed:
- app.py +35 -0
- local_config.py +1 -0
- utils/anno/cls/__pycache__/text_classification.cpython-310.pyc +0 -0
- utils/anno/cls/text_classification.py +38 -0
- utils/anno/ner/__pycache__/entity_extract.cpython-310.pyc +0 -0
- utils/anno/ner/entity_extract.py +47 -0
- utils/api/__pycache__/google_trans.cpython-310.pyc +0 -0
- utils/api/chatglm.py +10 -0
- utils/api/google_trans.py +16 -0
- utils/auto_learn/cluster_text.py +100 -0
- utils/format/bio_2_json.py +49 -0
app.py
ADDED
@@ -0,0 +1,35 @@
+import gradio as gr
+import json
+
+def auto_anno(txt, types, radio, need_trans=False):
+    if need_trans:
+        txt = en2cn(txt)
+    if radio == '文本分类':
+        result = text_classification(txt, types)
+    elif radio == '实体抽取':
+        result = extract_named_entities(txt, types)
+    if need_trans:
+        result = f'{txt}\n{result}'
+    return result
+
+input1 = gr.Textbox(lines=3, label="输入原句")
+input2 = gr.Textbox(lines=3, label="输入类别")
+output = gr.Textbox(label="输出结果")
+radio = gr.Radio(["文本分类", "实体抽取"], label="算法类型")
+checkbox = gr.Checkbox(label="翻译成中文")
+
+# Import the annotation and translation helpers
+from utils.anno.cls.text_classification import text_classification
+from utils.anno.ner.entity_extract import extract_named_entities
+from utils.api.google_trans import en2cn
+
+if __name__ == '__main__':
+    # # Batch text classification
+    # txts = open('data/cls/jd.csv', 'r', encoding='utf-8').read().split('\n')[1:]
+    # txts = [txt.split(',')[0] for txt in txts if txt != '']
+
+    # results = []
+    # for txt in txts:
+    #     results.append(text_classification(txt, ['好评', '差评']))
+    demo = gr.Interface(fn=auto_anno, inputs=[input1, input2, radio, checkbox], outputs=[output])
+    demo.launch(share=True)
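Note: auto_anno can be smoke-tested without launching the UI by importing it directly. A minimal sketch, assuming it runs from the repo root with a valid key in local_config.py (importing app also constructs the Gradio widgets as a side effect):

    import sys
    sys.path.append('.')
    from app import auto_anno

    # Classify one review; '文本分类' selects the classification branch
    print(auto_anno('这个商品真不错', ['好评', '差评'], '文本分类'))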
local_config.py
ADDED
@@ -0,0 +1 @@
+openai_key = 'sk-****'  # redacted; supply your own OpenAI API key
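Note: committing a live key puts it in the repo history for good; reading it from the environment keeps the secret out of version control. A sketch of an alternative local_config.py, assuming callers keep importing openai_key:

    import os

    # Resolve the key from the environment at import time instead of hard-coding it
    openai_key = os.environ.get('OPENAI_API_KEY', '')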
utils/anno/cls/__pycache__/text_classification.cpython-310.pyc
ADDED
Binary file (1.25 kB)
utils/anno/cls/text_classification.py
ADDED
@@ -0,0 +1,38 @@
+import openai
+import sys
+sys.path.append('.')
+from local_config import openai_key
+
+# Set up your API key
+openai.api_key = openai_key
+
+def text_classification(src_txt, type_arr):
+    system = f"你是一个聪明而且有百年经验的文本分类器. 你的任务是从一段文本里面提取出相应的分类结果标签。你的回答必须用统一的格式。文本用```符号分割。分类类型保存在一个数组里{type_arr}"
+    user = "输入|```这个商品真垃圾```输出|"
+    assistant = "差评"
+    input = f"输入|```{src_txt}```输出|"
+    # Call the OpenAI API
+    completion = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": f"{system}"},
+            {"role": "user", "content": f"{user}"},
+            {"role": "assistant", "content": f"{assistant}"},
+            {"role": "user", "content": f"{input}"}
+        ]
+    )
+
+    # Extract the model's reply (a bare label, no JSON here)
+    content = completion.choices[0].message.content
+    return content
+
+if __name__ == '__main__':
+    type_arr = ['好评', '差评']
+    txts = [
+        '这个商品真不错',
+        '用着不行',
+        '没用过这么好的东西'
+    ]
+    for txt in txts:
+        result = text_classification(txt, type_arr)
+        print(txt, result)
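Note: gpt-3.5-turbo occasionally replies with extra words or a label outside type_arr, and text_classification returns the reply verbatim. A thin validation wrapper, sketched under the assumption that an exact label match is wanted (classify_checked is hypothetical, not part of the commit):

    import sys
    sys.path.append('.')
    from utils.anno.cls.text_classification import text_classification

    def classify_checked(src_txt, type_arr):
        # Trim whitespace and reject anything that is not exactly one allowed label
        label = text_classification(src_txt, type_arr).strip()
        return label if label in type_arr else None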
utils/anno/ner/__pycache__/entity_extract.cpython-310.pyc
ADDED
Binary file (1.47 kB)
utils/anno/ner/entity_extract.py
ADDED
@@ -0,0 +1,47 @@
+import openai
+import json
+import sys
+sys.path.append('.')
+from local_config import openai_key
+
+# Set up your API key
+openai.api_key = openai_key
+
+def extract_named_entities(src_txt, type_arr):
+    system = f"你是一个聪明而且有百年经验的命名实体识别(NER)识别器. 你的任务是从一段文本里面提取出相应的实体并且给出标签。你的回答必须用统一的格式。文本用```符号分割。输出采用Json的格式并且标记实体在文本中的位置。实体类型保存在一个数组里{type_arr}"
+    user = "输入|```皮卡丘神奇宝贝```输出|"
+    assistant = """[{"name": "皮卡丘", "type": "Person", "start": 0, "end": 3}, {"name": "神奇宝贝", "type": "物种", "start": 4, "end": 8}]"""
+    input = f"输入|```{src_txt}```输出|"
+    # Call the OpenAI API
+    completion = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": f"{system}"},
+            {"role": "user", "content": f"{user}"},
+            {"role": "assistant", "content": f"{assistant}"},
+            {"role": "user", "content": f"{input}"}
+        ]
+    )
+
+    # Extract the output and parse the JSON array
+    content = completion.choices[0].message.content
+    print(content)
+    j = json.loads(content)
+    return j
+
+if __name__ == '__main__':
+    # extract_named_entities("```汤姆每天都被杰瑞欺负,皮卡丘越来越想帮忙,竟然还总是被拒绝,心想难道我“皮大仙”这点能力都没有?而且,这货不是被虐狂吧```", ["Person", "物种"])
+    extract_named_entities('老百姓心新乡新闻网话说这几天新乡天气还好吧偷笑', ['代称', '行政区'])
+    # Tags: PER (person name), LOC (location name), GPE (administrative region name), ORG (organization name)
+    # Label  Tag      Meaning
+    # PER    PER.NAM  proper name (张三)
+    #        PER.NOM  generic reference / category (穷人)
+    # LOC    LOC.NAM  specific place name (紫玉山庄)
+    #        LOC.NOM  generic place (大峡谷、宾馆)
+    # GPE    GPE.NAM  administrative region name (北京)
+    # ORG    ORG.NAM  specific organization name (通惠医院)
+    #        ORG.NOM  generic / collective name (文艺公司)
+    # Gold annotation: 老百姓 PER.NOM, 新乡 GPE.NAM
+    # gpt-3.5-turbo [{"name": "老百姓", "type": "代称", "start": 0, "end": 4}, {"name": "新乡新闻网", "type": "组织机构", "start": 4, "end": 10}, {"name": "新乡", "type": "行政区", "start": 12, "end": 14}, {"name": "天气", "type": "自然现象", "start": 14, "end": 16}]
+    # ERNIE-UIE {"text":"老百姓心新乡新闻网话说这几天新乡天气还好吧偷笑","result":[{"行政区":[{"text":"新乡","start":4,"end":6,"probability":0.589552328738506}]}]}
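Note: json.loads raises ValueError whenever the model wraps the array in prose, which would propagate straight into the Gradio callback. A tolerant parse, sketched with a bracket-slicing fallback (the fallback heuristic is an assumption, not part of the commit):

    import json

    def parse_entities(content):
        # First try the reply as-is, then fall back to the outermost [...] span
        try:
            return json.loads(content)
        except ValueError:
            start, end = content.find('['), content.rfind(']')
            if start != -1 and end > start:
                return json.loads(content[start:end + 1])
            raise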
utils/api/__pycache__/google_trans.cpython-310.pyc
ADDED
Binary file (889 Bytes)
utils/api/chatglm.py
ADDED
@@ -0,0 +1,10 @@
+import requests
+
+# Demo page: https://fd7fa865d3f27cda69.gradio.live/
+# Request payload
+data = {'prompt': '清华大学地址'}
+# Send a POST request to the API
+response = requests.post('http://region-9.seetacloud.com:51661/', json=data)
+# Read the prediction result
+result = response.json()
+print(result)
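Note: requests.post has no default timeout, and response.json() will choke on an HTML error page. A slightly hardened sketch of the same call (the 30-second timeout is an arbitrary choice):

    import requests

    def ask_chatglm(prompt, url='http://region-9.seetacloud.com:51661/'):
        # Bound the wait and surface HTTP errors before decoding the body
        response = requests.post(url, json={'prompt': prompt}, timeout=30)
        response.raise_for_status()
        return response.json()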
utils/api/google_trans.py
ADDED
@@ -0,0 +1,16 @@
+import requests
+import json
+
+def en2cn(text):
+    return trans(text, 'en', 'zh-CN')
+
+def trans(text, sl, tl):
+    temp_url = 'https://translate.googleapis.com/translate_a/single?client=gtx&sl={sl}&tl={tl}&dt=t&q={q}'
+    url = temp_url.format(q=text, sl=sl, tl=tl)
+    result = requests.get(url)
+    j = json.loads(result.content)
+    cn = ''.join([i[0] for i in j[0]])
+    return cn
+
+if __name__ == '__main__':
+    print(en2cn('hello world'))
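Note: formatting q straight into the URL leaves it unencoded, so input containing &, #, or + would corrupt the query string. Passing params to requests.get URL-encodes every field; a sketch of the equivalent call:

    import requests

    def trans_safe(text, sl, tl):
        # requests URL-encodes each parameter, so punctuation in `text` is safe
        result = requests.get(
            'https://translate.googleapis.com/translate_a/single',
            params={'client': 'gtx', 'sl': sl, 'tl': tl, 'dt': 't', 'q': text},
        )
        j = result.json()
        return ''.join(seg[0] for seg in j[0])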
utils/auto_learn/cluster_text.py
ADDED
@@ -0,0 +1,100 @@
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import euclidean_distances
+import openai
+import numpy as np
+# import matplotlib
+# print(matplotlib.matplotlib_fname())
+import sys
+sys.path.append('.')
+from local_config import openai_key
+
+
+def cluster_text(text_list, n_clusters=20, openai_api_key=openai_key):
+    # Set OpenAI API key
+    openai.api_key = openai_api_key
+    model = "text-embedding-ada-002"
+    # Convert text_list to numerical data using the OpenAI embeddings API
+    data = []
+    for text in text_list:
+        emb_req = openai.Embedding.create(input=[text], model=model)
+        embeddings = emb_req.data[0].embedding
+        data.append(embeddings)
+    data = np.array(data)
+
+    # Cluster the data
+    kmeans = KMeans(n_clusters=n_clusters)
+    kmeans.fit(data)
+
+    # Get the cluster centers
+    centers = kmeans.cluster_centers_
+
+    # Get the distances from each sample to each center
+    # distances = kmeans.transform(data)
+    distances = euclidean_distances(data, centers)
+
+    # For each center, the index of the sample farthest from it
+    indices = np.argmax(distances, axis=0)
+
+    # Collect those samples, skipping duplicates
+    samples = []
+    seen_samples = set()
+    for c, i in enumerate(indices):
+        sample = text_list[i]
+        if sample not in seen_samples:
+            samples.append(sample)
+            seen_samples.add(sample)
+        else:
+            # Fall back to the next-farthest unseen sample for this cluster
+            sorted_indices = np.argsort(distances[:, c])
+            for j in sorted_indices[::-1]:
+                sample = text_list[j]
+                if sample not in seen_samples:
+                    samples.append(sample)
+                    seen_samples.add(sample)
+                    break
+
+    # Return samples as a list of strings
+    return samples
+
+
+def plot_clusters(text_list, n_clusters=20, openai_api_key=openai_key):
+    # Set OpenAI API key
+    openai.api_key = openai_api_key
+    model = "text-embedding-ada-002"
+    # Convert text_list to numerical data using the OpenAI embeddings API
+    data = []
+    for text in text_list:
+        emb_req = openai.Embedding.create(input=[text], model=model)
+        embeddings = emb_req.data[0].embedding
+        data.append(embeddings)
+    data = np.array(data)
+
+    # Cluster the data
+    kmeans = KMeans(n_clusters=n_clusters)
+    kmeans.fit(data)
+
+    # Reduce the dimensionality of the data
+    pca = PCA(n_components=2)
+    reduced_data = pca.fit_transform(data)
+
+    # Plot the reduced data
+    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans.labels_)
+    for i, text in enumerate(text_list):
+        plt.annotate(text, (reduced_data[i, 0], reduced_data[i, 1]))
+    plt.show()
+
+
+if __name__ == "__main__":
+    test_data = [
+        '一百多和三十的也看不出什么区别,包装精美,质量应该不错。',
+        '质量很好 料子很不错 做工细致 样式好看 穿着很漂亮',
+        ' 会卷的 建议买大的小的会卷 胖就别买了 没用',
+        '大差了 布料很差 我也不想多说',
+        '一点也不好,我买的东西拿都拿到快递员自己签收了还不给我,恶心恶心恶心,不要脸不要脸'
+    ]
+
+    result = cluster_text(test_data, n_clusters=3)
+    plot_clusters(test_data, n_clusters=3)
+
+    print(result)
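Note: both functions embed texts one request at a time; the Embedding endpoint also accepts a list of inputs, which turns N round-trips into one. A batched sketch with the same output shape (openai 0.x API, as used above):

    import numpy as np
    import openai

    def embed_batch(text_list, model='text-embedding-ada-002'):
        # One request for the whole list; items come back in input order
        emb_req = openai.Embedding.create(input=text_list, model=model)
        return np.array([d.embedding for d in emb_req.data])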
utils/format/bio_2_json.py
ADDED
@@ -0,0 +1,49 @@
+def bio_2_json_one(anno_txt):
+    ls = anno_txt.split('\n')
+    text = ''
+    anno = []
+    now_label = ''
+    for i, l in enumerate(ls):
+        char, label = l.split('\t')
+        text += char
+        if 'B-' in label:
+            start = i
+            now_label = label.split('-')[1]
+        if label == 'O':
+            if now_label:
+                anno.append([start, i, text[start:i], now_label])
+                now_label = ''
+                start = 0
+    if now_label:
+        i += 1
+        anno.append([start, i, text[start:i], now_label])
+    return {'text': text, 'anno': anno}
+
+
+def bio_2_json(txt):
+    anno_txts = txt.split('\n\n')
+    annos = []
+    for anno_txt in anno_txts:
+        if anno_txt == '':
+            continue
+        anno_j = bio_2_json_one(anno_txt)
+        annos.append(anno_j)
+    return annos
+
+
+if __name__ == '__main__':
+    txt = '''你\tB-PER
+是\tO
+一\tO
+个\tO
+聪\tB-PER
+明\tI-PER
+的\tO
+软\tB-ORG
+件\tI-ORG
+工\tI-ORG
+程\tI-ORG
+师\tI-ORG'''
+    # txt = open('data/ner/weibo_ner/dev.txt', 'r', encoding='utf-8').read()
+    annos = bio_2_json(txt)
+    print(annos)
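Note: for the inline demo the expected result can be worked out by hand, which makes a handy sanity check (spans are [start, end) offsets over the concatenated text):

    # bio_2_json(txt) on the demo string should yield:
    # [{'text': '你是一个聪明的软件工程师',
    #   'anno': [[0, 1, '你', 'PER'], [4, 6, '聪明', 'PER'], [7, 12, '软件工程师', 'ORG']]}]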