maxmon committed
Commit • 1086ffd
Parent(s): 1743235
feat: init

Files changed:
- app.py +35 -0
- local_config.py +1 -0
- utils/anno/cls/__pycache__/text_classification.cpython-310.pyc +0 -0
- utils/anno/cls/text_classification.py +38 -0
- utils/anno/ner/__pycache__/entity_extract.cpython-310.pyc +0 -0
- utils/anno/ner/entity_extract.py +47 -0
- utils/api/__pycache__/google_trans.cpython-310.pyc +0 -0
- utils/api/chatglm.py +10 -0
- utils/api/google_trans.py +16 -0
- utils/auto_learn/cluster_text.py +100 -0
- utils/format/bio_2_json.py +49 -0
app.py
ADDED
@@ -0,0 +1,35 @@
+import gradio as gr
+import json
+
+def auto_anno(txt, types, radio, need_trans=False):
+    if need_trans:
+        txt = en2cn(txt)
+    if radio == '文本分类':
+        result = text_classification(txt, types)
+    elif radio == '实体抽取':
+        result = extract_named_entities(txt, types)
+    if need_trans:
+        result = f'{txt}\n{result}'
+    return result
+
+input1 = gr.Textbox(lines=3, label="输入原句")
+input2 = gr.Textbox(lines=3, label="输入类别")
+output = gr.Textbox(label="输出结果")
+radio = gr.Radio(["文本分类", "实体抽取"], label="算法类型")
+checkbox = gr.Checkbox(label="翻译成中文")
+
+# Import the annotation and translation helpers
+from utils.anno.cls.text_classification import text_classification
+from utils.anno.ner.entity_extract import extract_named_entities
+from utils.api.google_trans import en2cn
+
+if __name__ == '__main__':
+    # # Batch text classification
+    # txts = open('data/cls/jd.csv', 'r', encoding='utf-8').read().split('\n')[1:]
+    # txts = [txt.split(',')[0] for txt in txts if txt != '']
+
+    # results = []
+    # for txt in txts:
+    #     results.append(text_classification(txt, ['好评', '差评']))
+    demo = gr.Interface(fn=auto_anno, inputs=[input1, input2, radio, checkbox], outputs=[output])
+    demo.launch(share=True)
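Note: auto_anno can be smoke-tested without launching the UI by importing it directly. A minimal sketch, assuming it runs from the repo root with a valid key in local_config.py (importing app also constructs the Gradio widgets as a side effect):

    import sys
    sys.path.append('.')
    from app import auto_anno

    # Classify one review; '文本分类' selects the classification branch
    print(auto_anno('这个商品真不错', ['好评', '差评'], '文本分类'))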
local_config.py
ADDED
@@ -0,0 +1 @@
+openai_key = 'sk-****'  # redacted; supply your own OpenAI API key
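Note: committing a live key puts it in the repo history for good; reading it from the environment keeps the secret out of version control. A sketch of an alternative local_config.py, assuming callers keep importing openai_key:

    import os

    # Resolve the key from the environment at import time instead of hard-coding it
    openai_key = os.environ.get('OPENAI_API_KEY', '')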
utils/anno/cls/__pycache__/text_classification.cpython-310.pyc
ADDED
Binary file (1.25 kB)
utils/anno/cls/text_classification.py
ADDED
@@ -0,0 +1,38 @@
+import openai
+import sys
+sys.path.append('.')
+from local_config import openai_key
+
+# Set up your API key
+openai.api_key = openai_key
+
+def text_classification(src_txt, type_arr):
+    system = f"你是一个聪明而且有百年经验的文本分类器. 你的任务是从一段文本里面提取出相应的分类结果标签。你的回答必须用统一的格式。文本用```符号分割。分类类型保存在一个数组里{type_arr}"
+    user = "输入|```这个商品真垃圾```输出|"
+    assistant = "差评"
+    input = f"输入|```{src_txt}```输出|"
+    # Call the OpenAI API
+    completion = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": f"{system}"},
+            {"role": "user", "content": f"{user}"},
+            {"role": "assistant", "content": f"{assistant}"},
+            {"role": "user", "content": f"{input}"}
+        ]
+    )
+
+    # Extract the model's reply (a bare label, no JSON here)
+    content = completion.choices[0].message.content
+    return content
+
+if __name__ == '__main__':
+    type_arr = ['好评', '差评']
+    txts = [
+        '这个商品真不错',
+        '用着不行',
+        '没用过这么好的东西'
+    ]
+    for txt in txts:
+        result = text_classification(txt, type_arr)
+        print(txt, result)
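Note: gpt-3.5-turbo occasionally replies with extra words or a label outside type_arr, and text_classification returns the reply verbatim. A thin validation wrapper, sketched under the assumption that an exact label match is wanted (classify_checked is hypothetical, not part of the commit):

    import sys
    sys.path.append('.')
    from utils.anno.cls.text_classification import text_classification

    def classify_checked(src_txt, type_arr):
        # Trim whitespace and reject anything that is not exactly one allowed label
        label = text_classification(src_txt, type_arr).strip()
        return label if label in type_arr else None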
utils/anno/ner/__pycache__/entity_extract.cpython-310.pyc
ADDED
Binary file (1.47 kB)
utils/anno/ner/entity_extract.py
ADDED
@@ -0,0 +1,47 @@
+import openai
+import json
+import sys
+sys.path.append('.')
+from local_config import openai_key
+
+# Set up your API key
+openai.api_key = openai_key
+
+def extract_named_entities(src_txt, type_arr):
+    system = f"你是一个聪明而且有百年经验的命名实体识别(NER)识别器. 你的任务是从一段文本里面提取出相应的实体并且给出标签。你的回答必须用统一的格式。文本用```符号分割。输出采用Json的格式并且标记实体在文本中的位置。实体类型保存在一个数组里{type_arr}"
+    user = "输入|```皮卡丘神奇宝贝```输出|"
+    assistant = """[{"name": "皮卡丘", "type": "Person", "start": 0, "end": 3}, {"name": "神奇宝贝", "type": "物种", "start": 4, "end": 8}]"""
+    input = f"输入|```{src_txt}```输出|"
+    # Call the OpenAI API
+    completion = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": f"{system}"},
+            {"role": "user", "content": f"{user}"},
+            {"role": "assistant", "content": f"{assistant}"},
+            {"role": "user", "content": f"{input}"}
+        ]
+    )
+
+    # Extract the output and parse the JSON array
+    content = completion.choices[0].message.content
+    print(content)
+    j = json.loads(content)
+    return j
+
+if __name__ == '__main__':
+    # extract_named_entities("```汤姆每天都被杰瑞欺负,皮卡丘越来越想帮忙,竟然还总是被拒绝,心想难道我“皮大仙”这点能力都没有?而且,这货不是被虐狂吧```", ["Person", "物种"])
+    extract_named_entities('老百姓心新乡新闻网话说这几天新乡天气还好吧偷笑', ['代称', '行政区'])
+    # Tags: PER (person name), LOC (location name), GPE (administrative region name), ORG (organization name)
+    # Label  Tag      Meaning
+    # PER    PER.NAM  proper name (张三)
+    #        PER.NOM  generic reference / category (穷人)
+    # LOC    LOC.NAM  specific place name (紫玉山庄)
+    #        LOC.NOM  generic place (大峡谷、宾馆)
+    # GPE    GPE.NAM  administrative region name (北京)
+    # ORG    ORG.NAM  specific organization name (通惠医院)
+    #        ORG.NOM  generic / collective name (文艺公司)
+    # Gold annotation: 老百姓 PER.NOM, 新乡 GPE.NAM
+    # gpt-3.5-turbo [{"name": "老百姓", "type": "代称", "start": 0, "end": 4}, {"name": "新乡新闻网", "type": "组织机构", "start": 4, "end": 10}, {"name": "新乡", "type": "行政区", "start": 12, "end": 14}, {"name": "天气", "type": "自然现象", "start": 14, "end": 16}]
+    # ERNIE-UIE {"text":"老百姓心新乡新闻网话说这几天新乡天气还好吧偷笑","result":[{"行政区":[{"text":"新乡","start":4,"end":6,"probability":0.589552328738506}]}]}
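Note: json.loads raises ValueError whenever the model wraps the array in prose, which would propagate straight into the Gradio callback. A tolerant parse, sketched with a bracket-slicing fallback (the fallback heuristic is an assumption, not part of the commit):

    import json

    def parse_entities(content):
        # First try the reply as-is, then fall back to the outermost [...] span
        try:
            return json.loads(content)
        except ValueError:
            start, end = content.find('['), content.rfind(']')
            if start != -1 and end > start:
                return json.loads(content[start:end + 1])
            raise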
utils/api/__pycache__/google_trans.cpython-310.pyc
ADDED
Binary file (889 Bytes)
utils/api/chatglm.py
ADDED
@@ -0,0 +1,10 @@
+import requests
+
+# Demo page: https://fd7fa865d3f27cda69.gradio.live/
+# Request payload
+data = {'prompt': '清华大学地址'}
+# Send a POST request to the API
+response = requests.post('http://region-9.seetacloud.com:51661/', json=data)
+# Read the prediction result
+result = response.json()
+print(result)
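Note: requests.post has no default timeout, and response.json() will choke on an HTML error page. A slightly hardened sketch of the same call (the 30-second timeout is an arbitrary choice):

    import requests

    def ask_chatglm(prompt, url='http://region-9.seetacloud.com:51661/'):
        # Bound the wait and surface HTTP errors before decoding the body
        response = requests.post(url, json={'prompt': prompt}, timeout=30)
        response.raise_for_status()
        return response.json()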
utils/api/google_trans.py
ADDED
@@ -0,0 +1,16 @@
+import requests
+import json
+
+def en2cn(text):
+    return trans(text, 'en', 'zh-CN')
+
+def trans(text, sl, tl):
+    temp_url = 'https://translate.googleapis.com/translate_a/single?client=gtx&sl={sl}&tl={tl}&dt=t&q={q}'
+    url = temp_url.format(q=text, sl=sl, tl=tl)
+    result = requests.get(url)
+    j = json.loads(result.content)
+    cn = ''.join([i[0] for i in j[0]])
+    return cn
+
+if __name__ == '__main__':
+    print(en2cn('hello world'))
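Note: formatting q straight into the URL leaves it unencoded, so input containing &, #, or + would corrupt the query string. Passing params to requests.get URL-encodes every field; a sketch of the equivalent call:

    import requests

    def trans_safe(text, sl, tl):
        # requests URL-encodes each parameter, so punctuation in `text` is safe
        result = requests.get(
            'https://translate.googleapis.com/translate_a/single',
            params={'client': 'gtx', 'sl': sl, 'tl': tl, 'dt': 't', 'q': text},
        )
        j = result.json()
        return ''.join(seg[0] for seg in j[0])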
utils/auto_learn/cluster_text.py
ADDED
@@ -0,0 +1,100 @@
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import euclidean_distances
+import openai
+import numpy as np
+# import matplotlib
+# print(matplotlib.matplotlib_fname())
+import sys
+sys.path.append('.')
+from local_config import openai_key
+
+
+def cluster_text(text_list, n_clusters=20, openai_api_key=openai_key):
+    # Set OpenAI API key
+    openai.api_key = openai_api_key
+    model = "text-embedding-ada-002"
+    # Convert text_list to numerical data using the OpenAI embeddings API
+    data = []
+    for text in text_list:
+        emb_req = openai.Embedding.create(input=[text], model=model)
+        embeddings = emb_req.data[0].embedding
+        data.append(embeddings)
+    data = np.array(data)
+
+    # Cluster the data
+    kmeans = KMeans(n_clusters=n_clusters)
+    kmeans.fit(data)
+
+    # Get the cluster centers
+    centers = kmeans.cluster_centers_
+
+    # Get the distances from each sample to each center
+    # distances = kmeans.transform(data)
+    distances = euclidean_distances(data, centers)
+
+    # For each center, the index of the sample farthest from it
+    indices = np.argmax(distances, axis=0)
+
+    # Collect those samples, skipping duplicates
+    samples = []
+    seen_samples = set()
+    for c, i in enumerate(indices):
+        sample = text_list[i]
+        if sample not in seen_samples:
+            samples.append(sample)
+            seen_samples.add(sample)
+        else:
+            # Fall back to the next-farthest unseen sample for this cluster
+            sorted_indices = np.argsort(distances[:, c])
+            for j in sorted_indices[::-1]:
+                sample = text_list[j]
+                if sample not in seen_samples:
+                    samples.append(sample)
+                    seen_samples.add(sample)
+                    break
+
+    # Return samples as a list of strings
+    return samples
+
+
+def plot_clusters(text_list, n_clusters=20, openai_api_key=openai_key):
+    # Set OpenAI API key
+    openai.api_key = openai_api_key
+    model = "text-embedding-ada-002"
+    # Convert text_list to numerical data using the OpenAI embeddings API
+    data = []
+    for text in text_list:
+        emb_req = openai.Embedding.create(input=[text], model=model)
+        embeddings = emb_req.data[0].embedding
+        data.append(embeddings)
+    data = np.array(data)
+
+    # Cluster the data
+    kmeans = KMeans(n_clusters=n_clusters)
+    kmeans.fit(data)
+
+    # Reduce the dimensionality of the data
+    pca = PCA(n_components=2)
+    reduced_data = pca.fit_transform(data)
+
+    # Plot the reduced data
+    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans.labels_)
+    for i, text in enumerate(text_list):
+        plt.annotate(text, (reduced_data[i, 0], reduced_data[i, 1]))
+    plt.show()
+
+
+if __name__ == "__main__":
+    test_data = [
+        '一百多和三十的也看不出什么区别,包装精美,质量应该不错。',
+        '质量很好 料子很不错 做工细致 样式好看 穿着很漂亮',
+        ' 会卷的 建议买大的小的会卷 胖就别买了 没用',
+        '大差了 布料很差 我也不想多说',
+        '一点也不好,我买的东西拿都拿到快递员自己签收了还不给我,恶心恶心恶心,不要脸不要脸'
+    ]
+
+    result = cluster_text(test_data, n_clusters=3)
+    plot_clusters(test_data, n_clusters=3)
+
+    print(result)
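Note: both functions embed texts one request at a time; the Embedding endpoint also accepts a list of inputs, which turns N round-trips into one. A batched sketch with the same output shape (openai 0.x API, as used above):

    import numpy as np
    import openai

    def embed_batch(text_list, model='text-embedding-ada-002'):
        # One request for the whole list; items come back in input order
        emb_req = openai.Embedding.create(input=text_list, model=model)
        return np.array([d.embedding for d in emb_req.data])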
utils/format/bio_2_json.py
ADDED
@@ -0,0 +1,49 @@
+def bio_2_json_one(anno_txt):
+    ls = anno_txt.split('\n')
+    text = ''
+    anno = []
+    now_label = ''
+    for i, l in enumerate(ls):
+        char, label = l.split('\t')
+        text += char
+        if 'B-' in label:
+            start = i
+            now_label = label.split('-')[1]
+        if label == 'O':
+            if now_label:
+                anno.append([start, i, text[start:i], now_label])
+                now_label = ''
+                start = 0
+    if now_label:
+        i += 1
+        anno.append([start, i, text[start:i], now_label])
+    return {'text': text, 'anno': anno}
+
+
+def bio_2_json(txt):
+    anno_txts = txt.split('\n\n')
+    annos = []
+    for anno_txt in anno_txts:
+        if anno_txt == '':
+            continue
+        anno_j = bio_2_json_one(anno_txt)
+        annos.append(anno_j)
+    return annos
+
+
+if __name__ == '__main__':
+    txt = '''你\tB-PER
+是\tO
+一\tO
+个\tO
+聪\tB-PER
+明\tI-PER
+的\tO
+软\tB-ORG
+件\tI-ORG
+工\tI-ORG
+程\tI-ORG
+师\tI-ORG'''
+    # txt = open('data/ner/weibo_ner/dev.txt', 'r', encoding='utf-8').read()
+    annos = bio_2_json(txt)
+    print(annos)
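Note: for the inline demo the expected result can be worked out by hand, which makes a handy sanity check (spans are [start, end) offsets over the concatenated text):

    # bio_2_json(txt) on the demo string should yield:
    # [{'text': '你是一个聪明的软件工程师',
    #   'anno': [[0, 1, '你', 'PER'], [4, 6, '聪明', 'PER'], [7, 12, '软件工程师', 'ORG']]}]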