HoneyTian committed on
Commit
e6fd0e8
1 Parent(s): 820797e
Files changed (6) hide show
  1. .gitignore +1 -1
  2. Dockerfile +3 -1
  3. install.sh +56 -0
  4. language_identification.md +13 -0
  5. main.py +39 -58
  6. requirements.txt +3 -2
.gitignore CHANGED
@@ -3,7 +3,7 @@
3
  .idea/
4
 
5
  #data/
6
- #pretrained_models/
7
  temp/
8
 
9
  **/cache/
 
3
  .idea/
4
 
5
  #data/
6
+ pretrained_models/
7
  temp/
8
 
9
  **/cache/
Dockerfile CHANGED
@@ -1,7 +1,7 @@
1
  # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
  # you will also find guides on how best to write your Dockerfile
3
 
4
- FROM python:3.8
5
 
6
  WORKDIR /code
7
 
@@ -27,4 +27,6 @@ WORKDIR $HOME/app
27
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
28
  COPY --chown=user . $HOME/app
29
 
 
 
30
  CMD ["python", "main.py"]
 
1
  # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
  # you will also find guides on how best to write your Dockerfile
3
 
4
+ FROM python:3.6
5
 
6
  WORKDIR /code
7
 
 
27
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
28
  COPY --chown=user . $HOME/app
29
 
30
+ RUN bash -c 'bash install.sh --stage 1 --stop_stage 1 --system_version ubuntu'
31
+
32
  CMD ["python", "main.py"]
install.sh ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# Download the pretrained models needed by the language identification demo.
#
# Usage:
#   bash install.sh --stage 1 --stop_stage 1 --system_version centos
#
# Every option is written as `--name value` and overrides the variable `name`
# declared below (dashes in the option name are turned into underscores).
# Only variables that already exist can be overridden.
#
# Stages:
#   1  download the fastText lid.176 models into ./pretrained_models

verbose=true
stage=-1
stop_stage=2
# Must be declared here: the option parser rejects names of unset variables,
# so without this line `--system_version ...` aborts with "invalid option".
system_version=""

work_dir="$(pwd)"


# parse options
while true; do
  [ -z "${1:-}" ] && break  # break if there are no arguments
  case "$1" in
    --*)
      name="${1#--}"
      name="${name//-/_}"

      # Accept only plain identifiers that name an already-declared variable.
      # The identifier check runs first so the indirect expansion below never
      # sees arbitrary text taken from the command line.
      if ! [[ "${name}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || [ -z "${!name+xxx}" ]; then
        echo "$0: invalid option $1" 1>&2
        exit 1
      fi

      # Without this guard `shift 2` fails, nothing is consumed and the loop
      # never terminates.
      if [ "$#" -lt 2 ]; then
        echo "$0: option $1 requires a value" 1>&2
        exit 1
      fi

      # Current value of the variable being overridden; a variable whose
      # default is true/false is treated as Boolean.
      old_value="${!name}"
      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
        was_bool=true
      else
        was_bool=false
      fi

      # Check that Boolean-valued arguments are really Boolean.
      if ${was_bool} && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1
      fi

      # Assign without eval so values containing spaces or quotes are stored
      # literally, like --cmd "queue.pl -sync y".
      printf -v "${name}" '%s' "$2"
      shift 2
      ;;

    *) break ;;
  esac
done


${verbose} && echo "system_version: ${system_version}"

pretrained_models_dir="${work_dir}/pretrained_models"

mkdir -p "${pretrained_models_dir}" || exit 1


if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  ${verbose} && echo "stage 1: download fasttext models"
  cd "${pretrained_models_dir}" || exit 1

  # -c resumes a partial file and does nothing when the file is complete, so
  # re-running the stage does not create lid.176.bin.1 style duplicates.
  # A failed download aborts the script so the caller sees the error.
  wget -c https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin || exit 1
  wget -c https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz || exit 1

fi
language_identification.md CHANGED
@@ -16,3 +16,16 @@ https://github.com/saffsd/langid.py/tree/master/langid/train
16
  4. 训练 NB (Naive Bayes) 概率模型, 即每个 item 对每个类型的概率贡献.
17
 
18
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  4. 训练 NB (Naive Bayes) 概率模型, 即每个 item 对每个类型的概率贡献.
17
 
18
  ```
19
+
20
+
21
+ ### fasttext
22
+
23
+ 识别 176 种语言。
24
+ https://fasttext.cc/docs/en/language-identification.html
25
+
26
+
27
+ ### 参考
28
+
29
+ ```text
30
+ https://zhuanlan.zhihu.com/p/600245782
31
+ ```
main.py CHANGED
@@ -6,14 +6,12 @@ https://huggingface.co/spaces/sayakpaul/demo-docker-gradio
6
  import argparse
7
  import json
8
  import platform
9
- from typing import Tuple
10
 
 
 
11
  import gradio as gr
12
- import langid
13
  from langid.langid import LanguageIdentifier, model
14
- import matplotlib.pyplot as plt
15
- import numpy as np
16
- from PIL import Image
17
 
18
  from project_settings import project_path, temp_directory
19
 
@@ -30,29 +28,40 @@ def get_args():
30
  default=(project_path / "lang_id_examples.json").as_posix(),
31
  type=str
32
  )
 
 
 
 
 
33
  args = parser.parse_args()
34
  return args
35
 
36
 
37
- lang_id_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
 
38
 
39
 
40
  def click_lang_id_button(text: str, ground_true: str, model_name: str):
41
  global lang_id_identifier
 
42
 
43
  if model_name == "langid":
44
  label, prob = lang_id_identifier.classify(text)
 
 
 
 
45
  else:
46
  label = "model_name not available."
47
- prob = 0.0
48
- return label, round(prob, 4)
49
 
50
 
51
  def main():
52
  args = get_args()
53
 
54
  brief_description = """
55
- ### Language Identification
56
  """
57
 
58
  # description
@@ -63,56 +72,28 @@ def main():
63
  with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
64
  lang_id_examples = json.load(f)
65
 
66
- # ui
67
- with gr.Blocks() as blocks:
68
- gr.Markdown(value=brief_description)
69
-
70
- with gr.Row():
71
- with gr.Column(scale=5):
72
- with gr.Tabs():
73
- with gr.TabItem("lang_id"):
74
- gr.Markdown(value="")
75
-
76
- with gr.Row():
77
- with gr.Column(scale=1):
78
- lang_id_text = gr.Textbox(lines=2, max_lines=50, label="text")
79
- lang_id_ground_true = gr.Textbox(label="ground_true")
80
-
81
- lang_id_model_name = gr.Dropdown(choices=["langid"], value="langid", label="model_name")
82
- lang_id_button = gr.Button("run", variant="primary")
83
-
84
- with gr.Column(scale=1):
85
- lang_id_label = gr.Textbox(label="label")
86
- lang_id_prob = gr.Number(label="prob")
87
-
88
- gr.Examples(
89
- examples=lang_id_examples,
90
- inputs=[
91
- lang_id_text,
92
- lang_id_ground_true,
93
- lang_id_model_name,
94
- ],
95
- outputs=[lang_id_label, lang_id_prob],
96
- fn=click_lang_id_button
97
- )
98
-
99
- # click event
100
- lang_id_button.click(
101
- click_lang_id_button,
102
- inputs=[
103
- lang_id_text,
104
- lang_id_ground_true,
105
- lang_id_model_name,
106
- ],
107
- outputs=[lang_id_label, lang_id_prob],
108
- )
109
-
110
- gr.Markdown(value=description)
111
-
112
- blocks.queue().launch(
113
  share=False if platform.system() == "Windows" else False,
114
- server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
115
- server_port=7860
116
  )
117
  return
118
 
 
6
  import argparse
7
  import json
8
  import platform
 
9
 
10
+ import fasttext
11
+ from fasttext.FastText import load_model, _FastText
12
  import gradio as gr
13
+ from gradio import inputs, outputs
14
  from langid.langid import LanguageIdentifier, model
 
 
 
15
 
16
  from project_settings import project_path, temp_directory
17
 
 
28
  default=(project_path / "lang_id_examples.json").as_posix(),
29
  type=str
30
  )
31
+ parser.add_argument(
32
+ "--fasttext_model",
33
+ default=(project_path / "pretrained_models/lid.176.bin").as_posix(),
34
+ type=str
35
+ )
36
  args = parser.parse_args()
37
  return args
38
 
39
 
40
+ lang_id_identifier: LanguageIdentifier = None
41
+ fasttext_model: _FastText = None
42
 
43
 
def click_lang_id_button(text: str, ground_true: str, model_name: str):
    """
    Identify the language of ``text`` with the selected model.

    :param text: the text to classify; may span several lines.
    :param ground_true: expected label shown next to the examples; it is not
        used for the prediction.
    :param model_name: ``"langid"`` or ``"fasttext"``.
    :return: tuple ``(label, prob)`` where ``prob`` is the score rounded to
        4 decimals and rendered as a string. An unknown ``model_name`` yields
        ``("model_name not available.", "-1")``.
    """
    # The models are module-level objects that are only read here, so no
    # ``global`` declaration is required.
    if model_name == "langid":
        label, prob = lang_id_identifier.classify(text)
    elif model_name == "fasttext":
        # fastText predicts one line at a time and raises ValueError when the
        # text contains a newline; the input textbox is multi-line, so fold
        # the lines into a single one first.
        single_line = " ".join(text.splitlines())
        labels, probs = fasttext_model.predict(single_line, k=1)

        # Labels come back as "__label__<code>"; drop the prefix by name
        # instead of a hard-coded slice length.
        label = labels[0]
        prefix = "__label__"
        if label.startswith(prefix):
            label = label[len(prefix):]
        prob = probs[0]
    else:
        label = "model_name not available."
        prob = -1
    return label, str(round(prob, 4))
58
 
59
 
60
  def main():
61
  args = get_args()
62
 
63
  brief_description = """
64
+ Language Identification
65
  """
66
 
67
  # description
 
72
  with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
73
  lang_id_examples = json.load(f)
74
 
75
+ global lang_id_identifier
76
+ global fasttext_model
77
+ lang_id_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
78
+ fasttext_model = fasttext.load_model(args.fasttext_model)
79
+
80
+ blocks = gr.Interface(
81
+ click_lang_id_button,
82
+ inputs=[
83
+ inputs.Textbox(lines=3, label="text"),
84
+ inputs.Textbox(label="ground_true"),
85
+ inputs.Dropdown(choices=["langid", "fasttext"], default="langid", label="model_name"),
86
+ ],
87
+ outputs=[
88
+ outputs.Textbox(label="label"),
89
+ outputs.Textbox(label="prob"),
90
+ ],
91
+ examples=lang_id_examples,
92
+ description=brief_description
93
+ )
94
+
95
+ blocks.launch(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  share=False if platform.system() == "Windows" else False,
 
 
97
  )
98
  return
99
 
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
- gradio==4.28.3
2
- langid==1.1.6
 
 
1
+ gradio==2.1.1
2
+ langid==1.1.6
3
+ fasttext==0.9.2