ReallyFloppyPenguin committed on
Commit
74b8bd8
·
verified ·
1 Parent(s): b4174f8

Update synthgen.py

Browse files
Files changed (1) hide show
  1. synthgen.py +85 -2
synthgen.py CHANGED
@@ -47,8 +47,8 @@ def generate_synthetic_text(
47
  "model": model,
48
  "messages": [
49
  {"role": "system", "content": system_message},
50
- {"role": "user", "content": prompt},
51
- ],
52
  "extra_headers": {
53
  # "HTTP-Referer": "YOUR_SITE_URL",
54
  "X-Title": "SynthGen",
@@ -194,6 +194,89 @@ def generate_synthetic_conversation(
194
 
195
  return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  # --- Main Execution (Example Usage) ---
199
  if __name__ == "__main__":
 
47
  "model": model,
48
  "messages": [
49
  {"role": "system", "content": system_message},
50
+ {"role": "user", "content": prompt},
51
+ ],
52
  "extra_headers": {
53
  # "HTTP-Referer": "YOUR_SITE_URL",
54
  "X-Title": "SynthGen",
 
194
 
195
  return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"
196
 
197
# Function to generate different types of content based on a topic
def generate_corpus_content(
    topic: str,
    content_type: str,  # e.g., "Corpus Snippets", "Short Story", "Article"
    length_param: int,  # Meaning depends on type (e.g., num snippets, approx words)
    model: str,
    system_message_base: str = "You are a helpful assistant generating synthetic content.",
    temperature: Optional[float] = 0.7,
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = None  # Use a larger default if None
) -> str:
    """
    Generate different types of synthetic content based on a topic.

    Args:
        topic: The central topic for the content.
        content_type: The type of content to generate.
        length_param: A parameter controlling length/quantity (meaning depends
            on type: number of snippets, or approximate word count).
        model: The model ID.
        system_message_base: Base system message (specialized per content type).
        temperature: Model temperature.
        top_p: Model top_p.
        max_tokens: Model max_tokens; estimated from length_param when None.

    Returns:
        The generated content string, or an "Error: ..." message.
    """
    # Guard clause: reject unsupported content types up front.
    if content_type not in ("Corpus Snippets", "Short Story", "Article"):
        return f"Error: Unknown content type '{content_type}'."

    # --- Build the prompt, specialized system message, and token budget ---
    if content_type == "Corpus Snippets":
        # length_param is a snippet count here; fall back to 5 snippets.
        count = length_param if length_param > 0 else 5
        request = (
            f"Generate exactly {count} distinct text snippets related to the topic: '{topic}'. "
            f"Each snippet should be a few sentences long and focus on a different aspect if possible. "
            f"Present each snippet clearly, perhaps separated by a blank line or a marker like '---'."
        )
        persona = "You are an AI generating diverse text snippets for a data corpus."
        # Rough token estimate per snippet when the caller gave no budget.
        budget = max_tokens if max_tokens is not None else count * 150
    elif content_type == "Short Story":
        # length_param is an approximate word count; default ~300 words.
        words = length_param if length_param > 0 else 300
        request = (
            f"Write a short story (approximately {words} words) centered around the topic: '{topic}'. "
            f"The story should have a clear beginning, middle, and end."
        )
        persona = "You are a creative AI writing a short story."
        budget = max_tokens if max_tokens is not None else int(words * 2.5)
    else:  # "Article" — the only remaining type after the guard above
        # length_param is an approximate word count; default ~500 words.
        words = length_param if length_param > 0 else 500
        request = (
            f"Write an informative article (approximately {words} words) about the topic: '{topic}'. "
            f"The article should be well-structured, factual (to the best of your ability), and engaging."
        )
        persona = "You are an AI assistant writing an informative article."
        budget = max_tokens if max_tokens is not None else int(words * 2.5)

    if not request:
        return "Error: Could not construct a valid prompt."

    # --- Delegate to the core generation function ---
    result = generate_synthetic_text(
        prompt=request,
        model=model,
        system_message=persona,
        temperature=temperature,
        top_p=top_p,
        max_tokens=budget
    )

    # Errors from generate_synthetic_text propagate unchanged; successful
    # output gets a descriptive title for clarity.
    if result.startswith("Error:"):
        return result
    return f"Generated {content_type} for topic '{topic}':\n\n{result}"
280
 
281
  # --- Main Execution (Example Usage) ---
282
  if __name__ == "__main__":