import gradio as gr
import requests
import os

## Bloom Inference API

# Models on the Hugging Face Hub expose an Inference API that allows direct calls through a simple HTTP interface.
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"

# Add a token called HF_TOKEN under your profile's Settings -> Access Tokens, then copy it
# to the repository secrets in this Space's settings panel. os.environ reads it from there.
HF_TOKEN = os.environ["HF_TOKEN"]

# The Authorization header's bearer token needs to include your HF_TOKEN value.
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
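
# Note (illustrative, not from the original comments): for text-generation models the
# Inference API responds with JSON shaped like [{"generated_text": "<prompt + continuation>"}],
# which is why the functions below read output[0]["generated_text"].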

# Improved text generation function
def text_generate(prompt, generated_txt):
    # Initialize Thoughts variable to aggregate text
    Thoughts = ""

    # Debug: display the prompt
    Thoughts += f"Prompt: {prompt}\n"

    json_ = {
        "inputs": prompt,
        "parameters": {
            "top_p": 0.9,
            "temperature": 1.1,
            "return_full_text": True,
            "do_sample": True,
        },
        "options": {
            "use_cache": True,
            "wait_for_model": True,
        },
    }
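    # Parameter notes: do_sample with top_p=0.9 (nucleus sampling) and temperature=1.1 keeps
    # the continuation varied; return_full_text=True makes the API echo the prompt in front of
    # the generated text, which the prompt-stripping logic below relies on; wait_for_model=True
    # blocks until the model is loaded instead of returning an error.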
    response = requests.post(API_URL, headers=headers, json=json_)
    output = response.json()

    # Debug: display the output
    Thoughts += f"Output: {output}\n"

    # On failure (e.g., auth or rate-limit errors) the API returns an error dict rather than a
    # list of generations; surface it in Thoughts instead of raising a KeyError below.
    if isinstance(output, dict) and "error" in output:
        Thoughts += f"API error: {output['error']}\n"
        return generated_txt, prompt, Thoughts

    output_tmp = output[0]['generated_text']

    # Debug: display the output_tmp
    Thoughts += f"output_tmp is: {output_tmp}\n"
    solution = output_tmp.split("\nQ:")[0]

    # Debug: display the solution after splitting
    Thoughts += f"Final response after splits is: {solution}\n"

    if '\nOutput:' in solution:
        final_solution = solution.split("\nOutput:")[0]
        Thoughts += f"Response after removing output is: {final_solution}\n"
    elif '\n\n' in solution:
        final_solution = solution.split("\n\n")[0]
        Thoughts += f"Response after removing new line entries is: {final_solution}\n"
    else:
        final_solution = solution

    if len(generated_txt) == 0:
        display_output = final_solution
    else:
        display_output = generated_txt[:-len(prompt)] + final_solution

    new_prompt = final_solution[len(prompt):]

    # Debug: display the new prompt for the next cycle
    Thoughts += f"new prompt for next cycle is: {new_prompt}\n"
    Thoughts += f"display_output for printing on screen is: {display_output}\n"

    if len(new_prompt) == 0:
        temp_text = display_output[::-1]
        Thoughts += f"What is the last character of the sentence?: {temp_text[0]}\n"

        if temp_text[1] == '.':
            first_period_loc = temp_text[2:].find('.') + 1
            Thoughts += f"Location of last Period is: {first_period_loc}\n"
            new_prompt = display_output[-first_period_loc:-1]
            Thoughts += f"Not sending blank as prompt so new prompt for next cycle is: {new_prompt}\n"
        else:
            first_period_loc = temp_text.find('.')
            Thoughts += f"Location of last Period is: {first_period_loc}\n"
            new_prompt = display_output[-first_period_loc:-1]
            Thoughts += f"Not sending blank as prompt so new prompt for next cycle is: {new_prompt}\n"

        display_output = display_output[:-1]

    return display_output, new_prompt, Thoughts
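
# Illustrative usage (a sketch for local testing only, assuming HF_TOKEN is set; not executed
# by this Space): chain two generation cycles by feeding the returned prompt back in, exactly
# as the "Discover Health Insights" button below does through the Gradio event wiring.
#
#   display_1, next_prompt, thoughts_1 = text_generate("Self-care matters because", "")
#   display_2, next_prompt, thoughts_2 = text_generate(next_prompt, display_1)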



    
# Legacy text generation function (kept for reference; superseded by text_generate above)
def text_generate_old(prompt, generated_txt): 
  # Prints to debug the code
  print(f"*****Inside text_generate - Prompt is :{prompt}")
  json_ = {
      "inputs": prompt,
      "parameters": {
          "top_p": 0.9,
          "temperature": 1.1,
          # "max_new_tokens": 64,
          "return_full_text": True,
          "do_sample": True,
      },
      "options": {
          "use_cache": True,
          "wait_for_model": True,
      },
  }

    
  response = requests.post(API_URL, headers=headers, json=json_)
  print(f"Response  is : {response}")
  output = response.json()
  print(f"output is : {output}") 
  output_tmp = output[0]['generated_text']
  print(f"output_tmp is: {output_tmp}")
  solution = output_tmp.split("\nQ:")[0]   
  print(f"Final response after splits is: {solution}")

    
  if '\nOutput:' in solution:
    final_solution = solution.split("\nOutput:")[0] 
    print(f"Response after removing output is: {final_solution}")
  elif '\n\n' in solution:
    final_solution = solution.split("\n\n")[0] 
    print(f"Response after removing new line entries is: {final_solution}")
  else:
    final_solution = solution
  if len(generated_txt) == 0 :
    display_output = final_solution
  else:
    display_output = generated_txt[:-len(prompt)] + final_solution

      
  new_prompt = final_solution[len(prompt):]
  print(f"New prompt for next cycle: {new_prompt}")
  print(f"Output final is : {display_output}")
  if len(new_prompt) == 0:
    temp_text = display_output[::-1]
    print(f"Last character of sentence: {temp_text[0]}")
    if temp_text[1] == '.':
      first_period_loc = temp_text[2:].find('.') + 1
      print(f"Location of last Period is: {first_period_loc}")
      new_prompt = display_output[-first_period_loc:-1]
      print(f"Not sending blank as prompt so new prompt for next cycle is : {new_prompt}")
    else:
      print("HERE")
      first_period_loc = temp_text.find('.')
      print(f"Last Period is : {first_period_loc}")
      new_prompt = display_output[-first_period_loc:-1]
      print(f"New prompt for next cycle is : {new_prompt}")
    display_output = display_output[:-1]
  return display_output, new_prompt  


Markdown = """


# 2023 Bloom Spaces

1. Model: https://huggingface.co/bigscience/bloom
2. Bloom Theme Generator: https://huggingface.co/spaces/awacke1/Write-Stories-Using-Bloom
3. Bloom Ghostwriter: https://huggingface.co/spaces/awacke1/Bloom.Generative.Writer
4. https://huggingface.co/spaces/awacke1/Bloom.Human.Feedback.File.Ops
5. https://huggingface.co/spaces/awacke1/04-AW-StorywriterwMem

🌸 🔎 Bloom Searcher 🔍 🌸

Tool design for Roots: [URL](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf).

Bloom on Wikipedia: [URL](https://en.wikipedia.org/wiki/BLOOM_(language_model)).

Bloom Video Playlist: [URL](https://www.youtube.com/playlist?list=PLHgX2IExbFouqnsIqziThlPCX_miiDq14).

To access the full corpus, check [URL](https://forms.gle/qyYswbEL5kA23Wu99).

BigScience - How to get started

BLOOM is BigScience's new 176B-parameter ML model, trained on a broad collection of natural language processing datasets, with many other tasks not yet explored. Below is a set of papers, models, links, and datasets around BigScience, which promises to be among the best and most recent large open models of its kind, benefitting all scientific pursuits.

Model: https://huggingface.co/bigscience/bloom

Papers:
BLOOM: A 176B-Parameter Open-Access Multilingual Language Model https://arxiv.org/abs/2211.05100
Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism https://arxiv.org/abs/1909.08053
8-bit Optimizers via Block-wise Quantization https://arxiv.org/abs/2110.02861
Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation https://arxiv.org/abs/2108.12409
https://huggingface.co/models?other=doi:10.57967/hf/0003
217 other models optimizing the use of Bloom via specialization: https://huggingface.co/models?other=bloom

Datasets:
Universal Dependencies: https://paperswithcode.com/dataset/universal-dependencies
WMT 2014: https://paperswithcode.com/dataset/wmt-2014
The Pile: https://paperswithcode.com/dataset/the-pile
HumanEval: https://paperswithcode.com/dataset/humaneval
FLORES-101: https://paperswithcode.com/dataset/flores-101
CrowS-Pairs: https://paperswithcode.com/dataset/crows-pairs
WikiLingua: https://paperswithcode.com/dataset/wikilingua
MTEB: https://paperswithcode.com/dataset/mteb
xP3: https://paperswithcode.com/dataset/xp3
DiaBLa: https://paperswithcode.com/dataset/diabla

Evals:
https://github.com/AaronCWacker/evals

## Language Models 🗣️
🏆 Bloom is among the largest open-access multilingual language models available to science! 🌸
### Comparison of Large Language Models
| Model Name        | Model Size (in Parameters) |
| ----------------- | -------------------------- |
| BigScience-tr11-176B | 176 billion |
| GPT-3             | 175 billion               |
| OpenAI's DALL-E 2.0 | 500 million               |
| NVIDIA's Megatron | 8.3 billion               |
| Transformer-XL    | 250 million               |
| XLNet             | 210 million               |

## ChatGPT Datasets 📚
- WebText
- Common Crawl
- BooksCorpus
- English Wikipedia
- Toronto Books Corpus
- OpenWebText

## ChatGPT Datasets - Details 📚
- **WebText:** A dataset of web text scraped from outbound Reddit links with at least 3 karma, used as a simple quality signal. This dataset was used to pretrain GPT-2.
  - [WebText: A Large-Scale Unsupervised Text Corpus by Radford et al.](https://paperswithcode.com/dataset/webtext)
- **Common Crawl:** A dataset of web pages from a variety of domains, which is updated regularly. This dataset was used to pretrain GPT-3.
  - [Language Models are Few-Shot Learners](https://paperswithcode.com/dataset/common-crawl) by Brown et al.
- **BooksCorpus:** A dataset of over 11,000 books from a variety of genres.
  - [Scalable Methods for 8 Billion Token Language Modeling](https://paperswithcode.com/dataset/bookcorpus) by Zhu et al.
- **English Wikipedia:** A dump of the English-language Wikipedia as of 2018, with articles from 2001-2017.
  - [Improving Language Understanding by Generative Pre-Training](https://huggingface.co/spaces/awacke1/WikipediaUltimateAISearch?logs=build) Space for Wikipedia Search
- **Toronto Books Corpus:** A dataset of over 7,000 books from a variety of genres, collected by the University of Toronto.
  - [Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond](https://paperswithcode.com/dataset/bookcorpus) by Schwenk and Douze.
- **OpenWebText:** An open-source recreation of the WebText corpus, built from Reddit outbound links and filtered to remove content likely to be low-quality or spammy.
  - [Language Models are Few-Shot Learners](https://paperswithcode.com/dataset/openwebtext) by Brown et al.
  
  
## Big Science Model 🚀
- 📜 Papers:
  1. BLOOM: A 176B-Parameter Open-Access Multilingual Language Model [Paper](https://arxiv.org/abs/2211.05100)
  2. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism [Paper](https://arxiv.org/abs/1909.08053)
  3. 8-bit Optimizers via Block-wise Quantization [Paper](https://arxiv.org/abs/2110.02861)
  4. Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation [Paper](https://arxiv.org/abs/2108.12409)
  5. [Other papers related to Big Science](https://huggingface.co/models?other=doi:10.57967/hf/0003)
  6. [217 other models optimized for use with Bloom](https://huggingface.co/models?other=bloom)
  
- 📚 Datasets:
1. **Universal Dependencies:** A collection of annotated treebanks for natural language processing in a wide range of languages, built around a cross-linguistically consistent dependency annotation scheme.
  - [Universal Dependencies official website.](https://universaldependencies.org/)
2. **WMT 2014:** The 2014 edition of the Workshop on Statistical Machine Translation, featuring shared tasks on translating between English and several other languages.
  - [WMT14 website.](http://www.statmt.org/wmt14/)
3. **The Pile:** An 800GB English-language corpus of diverse text, assembled from 22 smaller datasets sourced from across the internet.
  - [The Pile official website.](https://pile.eleuther.ai/)
4. **HumanEval:** A benchmark of 164 hand-written Python programming problems used to evaluate the functional correctness of code generated by language models.
  - [HumanEval on Papers with Code.](https://paperswithcode.com/dataset/humaneval)
5. **FLORES-101:** A dataset of parallel sentences in 101 languages, designed for evaluating multilingual and low-resource machine translation.
  - [FLORES-101 on Papers with Code.](https://paperswithcode.com/dataset/flores-101)
6. **CrowS-Pairs:** A challenge dataset of paired sentences for measuring social biases in masked language models.
  - [CrowS-Pairs on Papers with Code.](https://paperswithcode.com/dataset/crows-pairs)
7. **WikiLingua:** A cross-lingual abstractive summarization dataset of article/summary pairs drawn from WikiHow in 18 languages.
  - [WikiLingua on Papers with Code.](https://paperswithcode.com/dataset/wikilingua)
8. **MTEB:** The Massive Text Embedding Benchmark, covering a broad range of embedding tasks such as classification, clustering, retrieval, and semantic similarity.
  - [MTEB on Papers with Code.](https://paperswithcode.com/dataset/mteb)
9. **xP3:** A multilingual mixture of prompts and datasets spanning dozens of languages, used to fine-tune BLOOM into the instruction-following BLOOMZ models.
  - [xP3 on Papers with Code.](https://paperswithcode.com/dataset/xp3)
10. **DiaBLa:** A dataset of English-French bilingual dialogues for evaluating machine translation of informal, spontaneous conversation.
  - [DiaBLa on Papers with Code.](https://paperswithcode.com/dataset/diabla)
  
- 📚 Dataset Papers with Code
  1. [Universal Dependencies](https://paperswithcode.com/dataset/universal-dependencies)
  2. [WMT 2014](https://paperswithcode.com/dataset/wmt-2014)
  3. [The Pile](https://paperswithcode.com/dataset/the-pile)
  4. [HumanEval](https://paperswithcode.com/dataset/humaneval)
  5. [FLORES-101](https://paperswithcode.com/dataset/flores-101)
  6. [CrowS-Pairs](https://paperswithcode.com/dataset/crows-pairs)
  7. [WikiLingua](https://paperswithcode.com/dataset/wikilingua)
  8. [MTEB](https://paperswithcode.com/dataset/mteb)
  9. [xP3](https://paperswithcode.com/dataset/xp3)
  10. [DiaBLa](https://paperswithcode.com/dataset/diabla)
  
# Deep RL ML Strategy 🧠
The AI strategies are:
- Language Model Preparation using Human-Augmented Data with Supervised Fine-Tuning 🤖
- Reward Model Training with a Prompts Dataset - Multiple Models Generate Data to Rank 🎁
- Fine-Tuning with Reinforcement Reward and Distance Distribution Regret Score 🎯
- Proximal Policy Optimization Fine-Tuning 🤝
- Variations - Preference Model Pretraining 🤔
- Use of Ranking Datasets with Sentiment - Thumbs Up/Down, Distribution 📊
- Online Version Gathering Feedback 💬
- OpenAI - InstructGPT - Humans Generate LM Training Text 🔍
- DeepMind - Advantage Actor Critic in Sparrow, GopherCite 🦜
- Reward Model Trained on Human Preference Feedback 🏆

A minimal code sketch of the reward-model step appears after the resource list below.
For more information on specific techniques and implementations, check out the following resources:
- OpenAI's paper on [GPT-3](https://arxiv.org/abs/2005.14165), which details their language model preparation approach
- The [Soft Actor-Critic paper](https://arxiv.org/abs/1801.01290), which describes an off-policy actor-critic RL algorithm
- OpenAI's paper on [Reward Learning](https://arxiv.org/abs/1810.06580), which explains their approach to training reward models
- OpenAI's blog post on [GPT-3's fine-tuning process](https://openai.com/blog/fine-tuning-gpt-3/)
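
As a concrete illustration of the reward-model training step above, here is a minimal sketch (illustrative only, not taken from this Space or any specific RLHF library) of the pairwise preference loss commonly used when training a reward model: the model should score the human-preferred response higher than the rejected one.

```python
import math

def pairwise_preference_loss(reward_chosen: float, reward_rejected: float) -> float:
    # -log(sigmoid(r_chosen - r_rejected)): near zero when the preferred response
    # already outscores the rejected one, large when the ranking is inverted.
    margin = reward_chosen - reward_rejected
    return -math.log(1.0 / (1.0 + math.exp(-margin)))

print(round(pairwise_preference_loss(2.0, 0.5), 2))  # 0.2  (correct ranking)
print(round(pairwise_preference_loss(0.5, 2.0), 2))  # 1.7  (inverted ranking)
```

In practice this loss is averaged over many human-ranked response pairs, and the trained reward model then serves as the optimization target during PPO fine-tuning.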
"""

# An insightful and engaging self-care and health care demo
demo = gr.Blocks()

with demo:
    with gr.Row():
        input_prompt = gr.Textbox(
            label="Write a self-care or health care related question to get started...",
            lines=3,
            value="Dear AI, please tell me about the importance of self-care and how it contributes to overall health and well-being.",
        )

    with gr.Row():
        generated_txt = gr.Textbox(lines=2, visible=True)

    with gr.Row():
        Thoughts = gr.Textbox(lines=4, visible=True)

    gen = gr.Button("Discover Health Insights")

    with gr.Row():
        gr.Markdown(Markdown)

    
    # Each click runs one generation cycle: the continuation is appended to the running
    # text box and the next-cycle prompt is written back into the input box, so repeated
    # clicks keep extending the passage.
    gen.click(
        text_generate,
        inputs=[input_prompt, generated_txt],
        outputs=[generated_txt, input_prompt, Thoughts],
    )

demo.launch(enable_queue=True, debug=True)