Norod78 committed
Commit 34f0487 · 1 Parent(s): 5dcd10b

Hebrew book summary generator
README.md CHANGED
@@ -1,12 +1,12 @@
 ---
-title: HebrewBookSummaries
-emoji: 🐠
+title: תקצירי ספרים
+emoji: 📚
 colorFrom: green
-colorTo: pink
+colorTo: red
 sdk: gradio
-sdk_version: 3.29.0
+sdk_version: 3.28.1
 app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED

import random

import gradio as gr
import torch
from transformers import pipeline, set_seed, AutoTokenizer, AutoModelForCausalLM

# Hebrew UI strings. title: "Book-summary generator based on tiny GPT-Neo";
# article: "The language model was trained by Doron Adler".
title = "מחולל תקצירי ספרים מבוסס ג׳פיטי-נאו פצפון"
article = "מודל השפה אומן על ידי <a href=\"https://linktr.ee/Norod78\">דורון אדלר</a>"
description = "<p>Fine-tuned <a href=\"https://huggingface.co/Norod78/hebrew-gpt_neo-tiny\">Norod78/hebrew-gpt_neo-tiny</a> upon a book summary dataset</p>"

# Example prompts: "If you sometimes feel like", "The text generators / ",
# "The last person on Earth sat alone in his room when suddenly there was a knock".
examples = [
    ['אם מתחשק לכם לפעמים'],
    ["מחוללי הטקסט / "],
    ['האדם האחרון עלי אדמות ישב לבד בחדרו כשלפתע נשמעה דפיקה'],
]

model_id = "./hebrew-gpt_neo-tiny-HebrewBookSummaries"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count() if torch.cuda.is_available() else 0

# The fine-tuned checkpoint uses four custom special tokens, so declare them at
# load time (the correct keyword is unk_token, not unknown_token) and resize
# the embedding matrix to cover the enlarged vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_id, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>', unk_token='<|unknown|>')
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
model.resize_token_embeddings(len(tokenizer))
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

max_length = 96
top_k = 40
top_p = 0.92
temperature = 0.8
max_seed = (2**32) - 1
global_seed = random.randint(0, max_seed)

def text_generation(input_text=''):
    # Advance a global seed on every call so resubmitting the same prompt
    # still yields a different sample.
    global global_seed
    global_seed = global_seed + 1
    if global_seed >= max_seed:
        global_seed = 0
    if input_text is None or len(input_text) == 0:
        input_text = "<|startoftext|>"
    set_seed(global_seed)
    generated_text = text_generator(input_text,
                                    max_length=max_length,
                                    top_k=top_k,
                                    top_p=top_p,
                                    temperature=temperature,
                                    do_sample=True,
                                    repetition_penalty=2.0,
                                    num_return_sequences=1)
    # Strip special tokens and normalize whitespace in the generated sample.
    parsed_text = (generated_text[0]["generated_text"]
                   .replace("<|startoftext|>", "")
                   .replace("\r", "\n")
                   .replace("\n\n", "\n")
                   .replace("\t", " ")
                   .replace("<|pad|>", " * ")
                   .replace("\"\"", "\"")
                   .strip())
    print(f"parsed_text = \"{parsed_text}\" (seed = {global_seed})")
    return parsed_text

# Labels are Hebrew. Input: "Enter the opening words of the text here, pick one
# of the ready-made examples, or leave it empty. Whatever you like. Every press
# of Submit produces a different text." Output: "The text the generator creates
# appears here."
gr.Interface(
    text_generation,
    inputs=gr.Textbox(lines=1, label=".הזינו פה את מילות הפתיחה של הטקסט, בחרו את אחת מן הדוגמאות המוכנות או השאירו ריק. מה שבא לכם. בכל לחיצה על סאבמיט, יווצר טקסט אחר", elem_id="input_text"),
    outputs=gr.Textbox(type="text", label="פה מופיע הטקסט שהמחולל יוצר", elem_id="output_text"),
    css="#output_text{direction: rtl} #input_text{direction: rtl}",
    title=title,
    description=description,
    article=article,
    examples=examples,
    allow_flagging='never',
).launch()
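For a quick test outside the Gradio UI, a minimal sketch along these lines should work (assuming the checkpoint directory from this commit is present in the working directory; the sampling parameters mirror app.py, and the fixed seed is only for reproducibility):

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, set_seed

model_id = "./hebrew-gpt_neo-tiny-HebrewBookSummaries"
tokenizer = AutoTokenizer.from_pretrained(model_id, bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>',
                                          pad_token='<|pad|>', unk_token='<|unknown|>')
model = AutoModelForCausalLM.from_pretrained(model_id)
model.resize_token_embeddings(len(tokenizer))
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

set_seed(1234)  # fixed seed so the run is reproducible
out = generator("אם מתחשק לכם לפעמים",  # "If you sometimes feel like"
                max_length=96, do_sample=True, top_k=40, top_p=0.92,
                temperature=0.8, repetition_penalty=2.0)
print(out[0]["generated_text"])

Locally, pip install -r requirements.txt followed by python app.py should launch the same demo.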
hebrew-gpt_neo-tiny-HebrewBookSummaries/added_tokens.json ADDED

{
  "<|endoftext|>": 50257,
  "<|pad|>": 50260,
  "<|startoftext|>": 50258,
  "<|unknown|>": 50259
}
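These four entries sit on top of the base 50,257-entry BPE vocabulary, which is why config.json below reports vocab_size 50261 and why app.py calls resize_token_embeddings after loading. A quick check (assuming the checkpoint directory is available locally):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./hebrew-gpt_neo-tiny-HebrewBookSummaries")
assert len(tok) == 50261                              # 50257 base entries + 4 added tokens
assert tok.convert_tokens_to_ids("<|pad|>") == 50260  # matches added_tokens.json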
hebrew-gpt_neo-tiny-HebrewBookSummaries/config.json ADDED

{
  "_name_or_path": "Norod78/hebrew-gpt_neo-tiny",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.1,
  "attention_layers": [
    "global",
    "global",
    "global",
    "global",
    "global",
    "global"
  ],
  "attention_types": [
    [
      [
        "global"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "classifier_dropout": 0.1,
  "embed_dropout": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 1024,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 6,
  "pad_token_id": 50256,
  "resid_dropout": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.29.1",
  "use_cache": true,
  "vocab_size": 50261,
  "window_size": 256
}
hebrew-gpt_neo-tiny-HebrewBookSummaries/generation_config.json ADDED

{
  "_from_model_config": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "pad_token_id": 50256,
  "transformers_version": "4.29.1"
}
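These are the generation defaults persisted with the checkpoint; app.py overrides them per call with explicit sampling arguments (do_sample, top_k, top_p, temperature, repetition_penalty). To inspect what actually loads (again assuming the local checkpoint directory):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("./hebrew-gpt_neo-tiny-HebrewBookSummaries")
print(model.generation_config)  # bos/eos/pad token ids 50256, no sampling defaults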
hebrew-gpt_neo-tiny-HebrewBookSummaries/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
hebrew-gpt_neo-tiny-HebrewBookSummaries/pytorch_model.bin ADDED

version https://git-lfs.github.com/spec/v1
oid sha256:e1710bf6e58073a6715acc83a80391274813046322f9812f0ef56844a2ef5881
size 333929547
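The weights are stored through Git LFS, so the diff shows only this pointer; the recorded size (~334 MB) is consistent with the architecture in config.json. A rough float32 parameter count (approximate: it ignores biases and layer norms, and assumes GPT-Neo's default 4x MLP width when intermediate_size is null):

vocab, d_model, ctx, layers = 50261, 768, 1024, 6
ffn = 4 * d_model  # GPT-Neo default when intermediate_size is null

embeddings = vocab * d_model + ctx * d_model    # token + position embeddings
per_layer = 4 * d_model**2 + 2 * d_model * ffn  # attention (q,k,v,out) + MLP
total = embeddings + layers * per_layer         # ~81.9M parameters

print(f"{total:,} params, ~{total * 4 / 1e6:.0f} MB in float32")  # ~327 MB, near the 334 MB file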
hebrew-gpt_neo-tiny-HebrewBookSummaries/special_tokens_map.json ADDED

{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|pad|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": "<unk>"
}
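Note that the saved files disagree about the unknown token: "<unk>" here, "<|unknown|>" in added_tokens.json, and "<|endoftext|>" in tokenizer_config.json below, which is presumably why app.py passes the special tokens explicitly at load time. A quick way to see what a plain load actually wires up (assuming the local checkpoint directory):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./hebrew-gpt_neo-tiny-HebrewBookSummaries")
print(tok.special_tokens_map)  # which strings end up as bos/eos/pad/unk
print(tok.convert_tokens_to_ids(["<|startoftext|>", "<|pad|>", "<|unknown|>"]))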
hebrew-gpt_neo-tiny-HebrewBookSummaries/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
hebrew-gpt_neo-tiny-HebrewBookSummaries/tokenizer_config.json ADDED

{
  "add_bos_token": false,
  "add_prefix_space": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "clean_up_tokenization_spaces": true,
  "do_lower_case": false,
  "eos_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "errors": "replace",
  "full_tokenizer_file": null,
  "max_len": 1024,
  "model_max_length": 1024,
  "pad_token": {
    "__type": "AddedToken",
    "content": "<|pad|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "unknown_token": "<|unknown|>"
}
hebrew-gpt_neo-tiny-HebrewBookSummaries/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED

gradio
torch
transformers
tokenizers