Update synthgen.py
synthgen.py  CHANGED  +85 -2
@@ -47,8 +47,8 @@ def generate_synthetic_text(
         "model": model,
         "messages": [
             {"role": "system", "content": system_message},
-
-
+            {"role": "user", "content": prompt},
+        ],
         "extra_headers": {
             # "HTTP-Referer": "YOUR_SITE_URL",
             "X-Title": "SynthGen",
@@ -194,6 +194,89 @@ def generate_synthetic_conversation(
 
     return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"
 
+# Function to generate different types of content based on a topic
+def generate_corpus_content(
+    topic: str,
+    content_type: str,  # e.g., "Corpus Snippets", "Short Story", "Article"
+    length_param: int,  # Meaning depends on type (e.g., num snippets, approx words)
+    model: str,
+    system_message_base: str = "You are a helpful assistant generating synthetic content.",
+    temperature: Optional[float] = 0.7,
+    top_p: Optional[float] = None,
+    max_tokens: Optional[int] = None  # Use a larger default if None
+) -> str:
+    """
+    Generates different types of synthetic content based on a topic.
+
+    Args:
+        topic: The central topic for the content.
+        content_type: The type of content to generate.
+        length_param: A parameter controlling length/quantity (meaning depends on type).
+        model: The model ID.
+        system_message_base: Base system message (will be specialized).
+        temperature: Model temperature.
+        top_p: Model top_p.
+        max_tokens: Model max_tokens.
+
+    Returns:
+        The generated content string or an error message.
+    """
+
+    prompt = ""
+    system_message = system_message_base  # Start with base
+
+    # --- Construct Prompt based on Content Type ---
+    if content_type == "Corpus Snippets":
+        if length_param <= 0: length_param = 5  # Default number of snippets
+        prompt = (
+            f"Generate exactly {length_param} distinct text snippets related to the topic: '{topic}'. "
+            f"Each snippet should be a few sentences long and focus on a different aspect if possible. "
+            f"Present each snippet clearly, perhaps separated by a blank line or a marker like '---'."
+        )
+        system_message = "You are an AI generating diverse text snippets for a data corpus."
+        # Adjust max_tokens based on expected number of snippets if not set
+        if max_tokens is None: max_tokens = length_param * 150  # Estimate
+
+    elif content_type == "Short Story":
+        if length_param <= 0: length_param = 300  # Default approx words
+        prompt = (
+            f"Write a short story (approximately {length_param} words) centered around the topic: '{topic}'. "
+            f"The story should have a clear beginning, middle, and end."
+        )
+        system_message = "You are a creative AI writing a short story."
+        if max_tokens is None: max_tokens = int(length_param * 2.5)  # Estimate
+
+    elif content_type == "Article":
+        if length_param <= 0: length_param = 500  # Default approx words
+        prompt = (
+            f"Write an informative article (approximately {length_param} words) about the topic: '{topic}'. "
+            f"The article should be well-structured, factual (to the best of your ability), and engaging."
+        )
+        system_message = "You are an AI assistant writing an informative article."
+        if max_tokens is None: max_tokens = int(length_param * 2.5)  # Estimate
+
+    else:
+        return f"Error: Unknown content type '{content_type}'."
+
+    if not prompt:
+        return "Error: Could not construct a valid prompt."
+
+    # --- Call the core generation function ---
+    generated_text = generate_synthetic_text(
+        prompt=prompt,
+        model=model,
+        system_message=system_message,
+        temperature=temperature,
+        top_p=top_p,
+        max_tokens=max_tokens
+    )
+
+    # Return the result (includes potential errors from generate_synthetic_text)
+    # Add a title for clarity
+    if not generated_text.startswith("Error:"):
+        return f"Generated {content_type} for topic '{topic}':\n\n{generated_text}"
+    else:
+        return generated_text  # Propagate the error
 
 # --- Main Execution (Example Usage) ---
 if __name__ == "__main__":
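The diff ends at the `__main__` guard, so the updated example usage itself is not visible here. Below is a minimal sketch of how the new generate_corpus_content function could be exercised in that block; the topic, model ID, and length value are illustrative assumptions, not taken from the diff.

# Illustrative only: every argument value below is an assumption.
if __name__ == "__main__":
    result = generate_corpus_content(
        topic="urban beekeeping",
        content_type="Corpus Snippets",  # or "Short Story" / "Article"
        length_param=5,                  # number of snippets for this content type
        model="openai/gpt-4o-mini",      # hypothetical model ID
    )
    print(result)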
|